mirror of
https://github.com/amd/blis.git
synced 2026-03-23 18:57:21 +00:00
Added extensive support for configuration defaults.
Details:
- Standard names for reference kernels (levels-1v, -1f and 3) are now
macro constants. Examples:
BLIS_SAXPYV_KERNEL_REF
BLIS_DDOTXF_KERNEL_REF
BLIS_ZGEMM_UKERNEL_REF
- Developers no longer have to name all datatype instances of a kernel
with a common base name; [sdcz] datatype flavors of each kernel or
micro-kernel (level-1v, -1f, or 3) may now be named independently.
This means you can now, if you wish, encode the datatype-specific
register blocksizes in the name of the micro-kernel functions.
- Any datatype instances of any kernel (1v, 1f, or 3) that is left
undefined in bli_kernel.h will default to the corresponding reference
implementation. For example, if BLIS_DGEMM_UKERNEL is left undefined,
it will be defined to be BLIS_DGEMM_UKERNEL_REF.
- Developers no longer need to name level-1v/-1f kernels with multiple
datatype chars to match the number of types the kernel WOULD take in
a mixed type environment, as in bli_dddaxpyv_opt(). Now, one char is
sufficient, as in bli_daxpyv_opt().
- There is no longer a need to define an obj_t wrapper to go along with
your level-1v/-1f kernels. The framework now prvides a _kernel()
function which serves as the obj_t wrapper for whatever kernels are
specified (or defaulted to) via bli_kernel.h
- Developers no longer need to prototype their kernels, and thus no
longer need to include any prototyping headers from within
bli_kernel.h. The framework now generates kernel prototypes, with the
proper type signature, based on the kernel names defined (or defaulted
to) via bli_kernel.h.
- If the complex datatype x (of [cz]) implementation of the gemm micro-
kernel is left undefined by bli_kernel.h, but its same-precision real
domain equivalent IS defined, BLIS will use a 4m-based implementation
for the datatype x implementations of all level-3 operations, using
only the real gemm micro-kernel.
This commit is contained in:
2
Makefile
2
Makefile
@@ -328,10 +328,12 @@ MK_BLIS_CONFIG_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG
|
||||
$(filter %.S, $(MK_CONFIG_SRC)))
|
||||
MK_BLIS_CONFIG_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_CONFIG_SRC)))
|
||||
|
||||
MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.S, $(MK_CONFIG_NOOPT_SRC)))
|
||||
MK_BLIS_CONFIG_NOOPT_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_CONFIG_NOOPT_SRC)))
|
||||
|
||||
MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.S, $(MK_CONFIG_KERNELS_SRC)))
|
||||
MK_BLIS_CONFIG_KERNELS_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,105 +123,28 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
// used by certain blocked variants. But when the *are* used, they MUST be
|
||||
// be an integer multiple of NR!
|
||||
|
||||
#define BLIS_DEFAULT_NI_FAC 16
|
||||
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
|
||||
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
|
||||
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
|
||||
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -233,16 +152,13 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_4x4.h"
|
||||
#define GEMM_UKERNEL gemm_opt_4x4
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -250,25 +166,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -276,23 +175,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -300,48 +190,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
1
config/armv7a/kernels
Symbolic link
1
config/armv7a/kernels
Symbolic link
@@ -0,0 +1 @@
|
||||
../../kernels/armv7a
|
||||
@@ -1,53 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
// #include "arm_neon.h"
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_opt_4x4 )
|
||||
|
||||
@@ -36,6 +36,9 @@
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
#undef restrict
|
||||
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -76,35 +76,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -122,10 +94,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -133,48 +129,22 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
@@ -182,44 +152,26 @@
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
#define BLIS_L1F_FUSE_FAC_S 8
|
||||
#define BLIS_L1F_FUSE_FAC_D 4
|
||||
#define BLIS_L1F_FUSE_FAC_C 4
|
||||
#define BLIS_L1F_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -229,16 +181,11 @@
|
||||
|
||||
#include "bli_gemm_8x8.h"
|
||||
|
||||
#define GEMM_UKERNEL gemm_8x8
|
||||
#define GEMM_UKERNEL_MT gemm_8x8_mt
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_8x8
|
||||
#define BLIS_DGEMM_UKERNEL_MT bli_dgemm_8x8_mt
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -246,25 +193,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -272,25 +202,16 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#include "bli_axpyf_opt_var1.h"
|
||||
|
||||
#define AXPYF_KERNEL axpyf_opt_var1
|
||||
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -298,52 +219,30 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#include "bli_axpyv_opt_var1.h"
|
||||
|
||||
#define AXPYV_KERNEL axpyv_opt_var1
|
||||
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#include "bli_dotv_opt_var1.h"
|
||||
|
||||
#define DOTV_KERNEL dotv_opt_var1
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,27 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,16 +151,11 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_4x4.h"
|
||||
#define GEMM_UKERNEL gemm_opt_4x4
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -238,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -264,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -288,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,27 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,16 +151,11 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_4x4.h"
|
||||
#define GEMM_UKERNEL gemm_opt_4x4
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -238,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -264,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -288,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -64,14 +64,6 @@
|
||||
// scomplex and dcomplex being defined in terms of simple structs.
|
||||
//#define BLIS_ENABLE_C99_COMPLEX
|
||||
|
||||
// Enable use of the 4m method to implement complex domain level-3
|
||||
// operations. By enabling this option, special code is activiated that
|
||||
// induces complex level-3 operations using ONLY the real domain
|
||||
// micro-kernels. This allows kernel authors to focus on optimizing
|
||||
// the real micro-kernels, which can then be leveraged to provide their
|
||||
// complex counterparts "for free".
|
||||
#define BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
|
||||
|
||||
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -62,13 +62,13 @@
|
||||
#define BLIS_DEFAULT_KC_D 384
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 384
|
||||
#define BLIS_DEFAULT_KC_C 384
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
//#define BLIS_DEFAULT_MC_C 384
|
||||
//#define BLIS_DEFAULT_KC_C 384
|
||||
//#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 192
|
||||
#define BLIS_DEFAULT_KC_Z 384
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
//#define BLIS_DEFAULT_MC_Z 192
|
||||
//#define BLIS_DEFAULT_KC_Z 384
|
||||
//#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// NOTE: If 4m blocksizes are not defined here, they will be determined
|
||||
// from the corresponding real domain blocksizes.
|
||||
@@ -90,35 +90,7 @@
|
||||
#define BLIS_DEFAULT_3M_KC_Z 256
|
||||
#define BLIS_DEFAULT_3M_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -136,10 +108,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -147,234 +143,99 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
#include "bli_gemm_opt_d4x4.h"
|
||||
|
||||
#include "bli_gemmtrsm_l_opt_d4x4.h"
|
||||
#include "bli_gemmtrsm_u_opt_d4x4.h"
|
||||
//#include "bli_gemmtrsm_l_ref_mxn.h"
|
||||
//#include "bli_gemmtrsm_u_ref_mxn.h"
|
||||
|
||||
//#include "bli_trsm_l_ref_4x4.h"
|
||||
//#include "bli_trsm_u_ref_4x4.h"
|
||||
#include "bli_trsm_l_ref_mxn.h"
|
||||
#include "bli_trsm_u_ref_mxn.h"
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_d4x4
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x4
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x4
|
||||
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_opt_4x4
|
||||
#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_opt_4x4
|
||||
|
||||
//#define TRSM_L_UKERNEL trsm_l_ref_4x4
|
||||
//#define TRSM_U_UKERNEL trsm_u_ref_4x4
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
#include "bli_axpy2v_opt_var1.h"
|
||||
#include "bli_dotaxpyv_opt_var1.h"
|
||||
#include "bli_axpyf_opt_var1.h"
|
||||
#include "bli_dotxf_opt_var1.h"
|
||||
#include "bli_dotxaxpyf_opt_var1.h"
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_opt_var1
|
||||
#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_opt_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_opt_var1
|
||||
#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_opt_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_opt_var1
|
||||
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_opt_var1
|
||||
#define BLIS_DDOTXF_KERNEL bli_ddotxf_opt_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1
|
||||
#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_opt_var1
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
#include "bli_axpyv_opt_var1.h"
|
||||
#include "bli_dotv_opt_var1.h"
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_opt_var1
|
||||
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_opt_var1
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,111 +123,39 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
#include "bli_gemm_opt_d4x4.h"
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_d4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_d4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -239,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -289,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,28 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 2
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,17 +152,10 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_30x8.h"
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_30x8
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -239,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -289,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,28 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,17 +152,10 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_4x6.h"
|
||||
|
||||
#define GEMM_UKERNEL gemm_4x6
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_4x6
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -239,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -289,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,28 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,18 +152,12 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
//#define GEMM_UKERNEL gemm_ref_mxn
|
||||
|
||||
#include "bli_gemm_opt_8x4.h"
|
||||
#define GEMM_UKERNEL gemm_opt_8x4
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -240,25 +165,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -266,23 +174,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -290,48 +189,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -35,300 +35,8 @@
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 64
|
||||
#define BLIS_DEFAULT_KC_S 128
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 64
|
||||
#define BLIS_DEFAULT_KC_D 128
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 8
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 8
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 8
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_ref_mxn
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
// In the reference configuration, we let all of the defaults take
|
||||
// effect. Thus, no definitions are needed.
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,111 +123,39 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
#include "bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h"
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_8x4_ref_u4_nodupl_avx1
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4_ref_u4_nodupl_avx1
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -239,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -289,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,9 +38,8 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
@@ -52,53 +51,24 @@
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 64
|
||||
#define BLIS_DEFAULT_KC_S 128
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
#define BLIS_DEFAULT_MC_S 128
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 2048
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 64
|
||||
#define BLIS_DEFAULT_KC_D 128
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
#define BLIS_DEFAULT_MC_D 128
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 2048
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 2048
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
#define BLIS_DEFAULT_MC_Z 128
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +86,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0
|
||||
//#define BLIS_EXTEND_KC_S 0
|
||||
//#define BLIS_EXTEND_NC_S 0
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0
|
||||
//#define BLIS_EXTEND_KC_D 0
|
||||
//#define BLIS_EXTEND_NC_D 0
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0
|
||||
//#define BLIS_EXTEND_KC_C 0
|
||||
//#define BLIS_EXTEND_NC_C 0
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0
|
||||
//#define BLIS_EXTEND_KC_Z 0
|
||||
//#define BLIS_EXTEND_NC_Z 0
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,24 +121,52 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNELS ---------------------------------------------------
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_mxn
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_mxn
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_mxn
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_mxn
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_opt_mxn
|
||||
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_opt_mxn
|
||||
#define BLIS_CGEMMTRSM_L_UKERNEL bli_cgemmtrsm_l_opt_mxn
|
||||
#define BLIS_ZGEMMTRSM_L_UKERNEL bli_zgemmtrsm_l_opt_mxn
|
||||
|
||||
#define BLIS_SGEMMTRSM_U_UKERNEL bli_sgemmtrsm_u_opt_mxn
|
||||
#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_opt_mxn
|
||||
#define BLIS_CGEMMTRSM_U_UKERNEL bli_cgemmtrsm_u_opt_mxn
|
||||
#define BLIS_ZGEMMTRSM_U_UKERNEL bli_zgemmtrsm_u_opt_mxn
|
||||
|
||||
#define BLIS_STRSM_L_UKERNEL bli_strsm_l_opt_mxn
|
||||
#define BLIS_DTRSM_L_UKERNEL bli_dtrsm_l_opt_mxn
|
||||
#define BLIS_CTRSM_L_UKERNEL bli_ctrsm_l_opt_mxn
|
||||
#define BLIS_ZTRSM_L_UKERNEL bli_ztrsm_l_opt_mxn
|
||||
|
||||
#define BLIS_STRSM_U_UKERNEL bli_strsm_u_opt_mxn
|
||||
#define BLIS_DTRSM_U_UKERNEL bli_dtrsm_u_opt_mxn
|
||||
#define BLIS_CTRSM_U_UKERNEL bli_ctrsm_u_opt_mxn
|
||||
#define BLIS_ZTRSM_U_UKERNEL bli_ztrsm_u_opt_mxn
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -158,17 +180,18 @@
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
//#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
//#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
//#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
//#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
//#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
//#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
//#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
//#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
@@ -176,66 +199,67 @@
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
//#define BLIS_L1F_FUSE_FAC_S 8
|
||||
//#define BLIS_L1F_FUSE_FAC_D 4
|
||||
//#define BLIS_L1F_FUSE_FAC_C 4
|
||||
//#define BLIS_L1F_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
//#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
//#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
//#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
//#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
//#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
//#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
//#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
//#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
//#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
//#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
//#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
//#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
// -- axpy2v --
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
#define BLIS_SAXPY2V_KERNEL bli_saxpy2v_opt_var1
|
||||
#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_opt_var1
|
||||
#define BLIS_CAXPY2V_KERNEL bli_caxpy2v_opt_var1
|
||||
#define BLIS_ZAXPY2V_KERNEL bli_zaxpy2v_opt_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define BLIS_SDOTAXPYV_KERNEL bli_sdotaxpyv_opt_var1
|
||||
#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_opt_var1
|
||||
#define BLIS_CDOTAXPYV_KERNEL bli_cdotaxpyv_opt_var1
|
||||
#define BLIS_ZDOTAXPYV_KERNEL bli_zdotaxpyv_opt_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define BLIS_SAXPYF_KERNEL bli_saxpyf_opt_var1
|
||||
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
|
||||
#define BLIS_CAXPYF_KERNEL bli_caxpyf_opt_var1
|
||||
#define BLIS_ZAXPYF_KERNEL bli_zaxpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define BLIS_SDOTXF_KERNEL bli_sdotxf_opt_var1
|
||||
#define BLIS_DDOTXF_KERNEL bli_ddotxf_opt_var1
|
||||
#define BLIS_CDOTXF_KERNEL bli_cdotxf_opt_var1
|
||||
#define BLIS_ZDOTXF_KERNEL bli_zdotxf_opt_var1
|
||||
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
#define BLIS_SDOTXAXPYF_KERNEL bli_sdotxaxpyf_opt_var1
|
||||
#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_opt_var1
|
||||
#define BLIS_CDOTXAXPYF_KERNEL bli_cdotxaxpyf_opt_var1
|
||||
#define BLIS_ZDOTXAXPYF_KERNEL bli_zdotxaxpyf_opt_var1
|
||||
|
||||
#include "bli_gemm_opt_mxn.h"
|
||||
#include "bli_trsm_l_opt_mxn.h"
|
||||
#include "bli_trsm_u_opt_mxn.h"
|
||||
#include "bli_gemmtrsm_l_opt_mxn.h"
|
||||
#include "bli_gemmtrsm_u_opt_mxn.h"
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_mxn
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_opt_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_opt_mxn
|
||||
|
||||
|
||||
|
||||
@@ -243,55 +267,30 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
//#define BLIS_SPACKM_2XK_KERNEL bli_spackm_ref_2xk
|
||||
//#define BLIS_DPACKM_2XK_KERNEL bli_dpackm_ref_2xk
|
||||
//#define BLIS_CPACKM_2XK_KERNEL bli_cpackm_ref_2xk
|
||||
//#define BLIS_ZPACKM_2XK_KERNEL bli_zpackm_ref_2xk
|
||||
|
||||
// -- unpackm --
|
||||
//#define BLIS_SPACKM_4XK_KERNEL bli_spackm_ref_4xk
|
||||
//#define BLIS_DPACKM_4XK_KERNEL bli_dpackm_ref_4xk
|
||||
//#define BLIS_CPACKM_4XK_KERNEL bli_cpackm_ref_4xk
|
||||
//#define BLIS_ZPACKM_4XK_KERNEL bli_zpackm_ref_4xk
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
//#define BLIS_SPACKM_6XK_KERNEL bli_spackm_ref_6xk
|
||||
//#define BLIS_DPACKM_6XK_KERNEL bli_dpackm_ref_6xk
|
||||
//#define BLIS_CPACKM_6XK_KERNEL bli_cpackm_ref_6xk
|
||||
//#define BLIS_ZPACKM_6XK_KERNEL bli_zpackm_ref_6xk
|
||||
|
||||
//#define BLIS_SPACKM_8XK_KERNEL bli_spackm_ref_8xk
|
||||
//#define BLIS_DPACKM_8XK_KERNEL bli_dpackm_ref_8xk
|
||||
//#define BLIS_CPACKM_8XK_KERNEL bli_cpackm_ref_8xk
|
||||
//#define BLIS_ZPACKM_8XK_KERNEL bli_zpackm_ref_8xk
|
||||
|
||||
// ...
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
// (Commented definitions for 10, 12, 14, and 16 not shown).
|
||||
|
||||
#include "bli_axpy2v_opt_var1.h"
|
||||
#include "bli_dotaxpyv_opt_var1.h"
|
||||
#include "bli_axpyf_opt_var1.h"
|
||||
#include "bli_dotxf_opt_var1.h"
|
||||
#include "bli_dotxaxpyf_opt_var1.h"
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_opt_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_opt_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_opt_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1
|
||||
|
||||
|
||||
|
||||
@@ -299,47 +298,81 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
//#define BLIS_SADDV_KERNEL bli_saddv_unb_var1
|
||||
//#define BLIS_DADDV_KERNEL bli_daddv_unb_var1
|
||||
//#define BLIS_CADDV_KERNEL bli_caddv_unb_var1
|
||||
//#define BLIS_ZADDV_KERNEL bli_zaddv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt_var1
|
||||
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
|
||||
#define BLIS_CAXPYV_KERNEL bli_caxpyv_opt_var1
|
||||
#define BLIS_ZAXPYV_KERNEL bli_zaxpyv_opt_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
//#define BLIS_SCOPYV_KERNEL bli_scopyv_unb_var1
|
||||
//#define BLIS_DCOPYV_KERNEL bli_dcopyv_unb_var1
|
||||
//#define BLIS_CCOPYV_KERNEL bli_ccopyv_unb_var1
|
||||
//#define BLIS_ZCOPYV_KERNEL bli_zcopyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
#define BLIS_SDOTV_KERNEL bli_sdotv_opt_var1
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
|
||||
#define BLIS_CDOTV_KERNEL bli_cdotv_opt_var1
|
||||
#define BLIS_ZDOTV_KERNEL bli_zdotv_opt_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
//#define BLIS_SDOTXV_KERNEL bli_sdotxv_unb_var1
|
||||
//#define BLIS_DDOTXV_KERNEL bli_ddotxv_unb_var1
|
||||
//#define BLIS_CDOTXV_KERNEL bli_cdotxv_unb_var1
|
||||
//#define BLIS_ZDOTXV_KERNEL bli_zdotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
//#define BLIS_SINVERTV_KERNEL bli_sinvertv_unb_var1
|
||||
//#define BLIS_DINVERTV_KERNEL bli_dinvertv_unb_var1
|
||||
//#define BLIS_CINVERTV_KERNEL bli_cinvertv_unb_var1
|
||||
//#define BLIS_ZINVERTV_KERNEL bli_zinvertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
//#define BLIS_SSCAL2V_KERNEL bli_sscal2v_unb_var1
|
||||
//#define BLIS_DSCAL2V_KERNEL bli_dscal2v_unb_var1
|
||||
//#define BLIS_CSCAL2V_KERNEL bli_cscal2v_unb_var1
|
||||
//#define BLIS_ZSCAL2V_KERNEL bli_zscal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
//#define BLIS_SSCALV_KERNEL bli_sscalv_unb_var1
|
||||
//#define BLIS_DSCALV_KERNEL bli_dscalv_unb_var1
|
||||
//#define BLIS_CSCALV_KERNEL bli_cscalv_unb_var1
|
||||
//#define BLIS_ZSCALV_KERNEL bli_zscalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
//#define BLIS_SSETV_KERNEL bli_ssetv_unb_var1
|
||||
//#define BLIS_DSETV_KERNEL bli_dsetv_unb_var1
|
||||
//#define BLIS_CSETV_KERNEL bli_csetv_unb_var1
|
||||
//#define BLIS_ZSETV_KERNEL bli_zsetv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
//#define BLIS_SSUBV_KERNEL bli_ssubv_unb_var1
|
||||
//#define BLIS_DSUBV_KERNEL bli_dsubv_unb_var1
|
||||
//#define BLIS_CSUBV_KERNEL bli_csubv_unb_var1
|
||||
//#define BLIS_ZSUBV_KERNEL bli_zsubv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
//#define BLIS_SSWAPV_KERNEL bli_sswapv_unb_var1
|
||||
//#define BLIS_DSWAPV_KERNEL bli_dswapv_unb_var1
|
||||
//#define BLIS_CSWAPV_KERNEL bli_cswapv_unb_var1
|
||||
//#define BLIS_ZSWAPV_KERNEL bli_zswapv_unb_var1
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -36,59 +36,59 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy )
|
||||
void bli_saxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_SAXPYV_KERNEL_REF( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy )
|
||||
void bli_daxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_DAXPYV_KERNEL_REF( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy )
|
||||
void bli_caxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_CAXPYV_KERNEL_REF( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy )
|
||||
void bli_zaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy )
|
||||
{
|
||||
/*
|
||||
Template axpyv kernel implementation
|
||||
@@ -193,11 +193,11 @@ void bli_zzzaxpyv_opt_var1( conj_t conjx,
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_ZAXPYV_KERNEL_REF( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -272,37 +272,3 @@ void bli_zzzaxpyv_opt_var1( conj_t conjx,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,opname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(cha,chx,chy,varname)( conjx, \
|
||||
n, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( axpyv_opt_var1, axpyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( axpyv_opt_var1, axpyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -36,66 +36,66 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho )
|
||||
void bli_sdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
BLIS_SDOTV_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho )
|
||||
void bli_ddotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
BLIS_DDOTV_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho )
|
||||
void bli_cdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
BLIS_CDOTV_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho )
|
||||
void bli_zdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho )
|
||||
{
|
||||
/*
|
||||
Template dotv kernel implementation
|
||||
@@ -210,12 +210,12 @@ void bli_zzzdotv_opt_var1( conj_t conjx,
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
BLIS_ZDOTV_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -310,36 +310,3 @@ void bli_zzzdotv_opt_var1( conj_t conjx,
|
||||
bli_zzcopys( dotxy, *rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,opname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict rho \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(chx,chy,chr,varname)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( dotv_opt_var1, dotv_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( dotv_opt_var1, dotv_unb_var1 )
|
||||
#endif
|
||||
|
||||
@@ -1,59 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype dotv kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( dotv_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( dotv_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( dotv_opt_var1 )
|
||||
#endif
|
||||
@@ -36,88 +36,88 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha1,
|
||||
float* restrict alpha2,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz
|
||||
)
|
||||
void bli_saxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha1,
|
||||
float* restrict alpha2,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_SAXPY2V_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha1,
|
||||
double* restrict alpha2,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz
|
||||
)
|
||||
void bli_daxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha1,
|
||||
double* restrict alpha2,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_DAXPY2V_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha1,
|
||||
scomplex* restrict alpha2,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz
|
||||
)
|
||||
void bli_caxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha1,
|
||||
scomplex* restrict alpha2,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_CAXPY2V_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha1,
|
||||
dcomplex* restrict alpha2,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz
|
||||
)
|
||||
void bli_zaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha1,
|
||||
dcomplex* restrict alpha2,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template axpy2v kernel implementation
|
||||
@@ -229,14 +229,14 @@ void bli_zzzaxpy2v_opt_var1(
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_ZAXPY2V_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -396,41 +396,3 @@ void bli_zzzaxpy2v_opt_var1(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_xy* restrict alpha1, \
|
||||
ctype_xy* restrict alpha2, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_z* restrict z, inc_t incz \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(chx,chy,chz,kername)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
alpha1, \
|
||||
alpha2, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
z, incz ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( axpy2v_opt_var1, axpy2v_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( axpy2v_opt_var1, axpy2v_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -36,87 +36,87 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
void bli_saxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_SAXPYF_KERNEL_REF( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
void bli_daxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_DAXPYF_KERNEL_REF( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
void bli_caxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_CAXPYF_KERNEL_REF( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
void bli_zzzaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
void bli_zaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template axpyf kernel implementation
|
||||
@@ -243,14 +243,14 @@ void bli_zzzaxpyf_opt_var1(
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_ZAXPYF_KERNEL_REF( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -376,41 +376,3 @@ void bli_zzzaxpyf_opt_var1(
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ax* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(cha,chx,chy,kername)( conja, \
|
||||
conjx, \
|
||||
m, \
|
||||
b_n, \
|
||||
alpha, \
|
||||
a, inca, lda, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( axpyf_opt_var1, axpyf_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( axpyf_opt_var1, axpyf_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -36,87 +36,87 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho,
|
||||
float* restrict z, inc_t incz )
|
||||
void bli_sdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho,
|
||||
float* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
BLIS_SDOTAXPYV_KERNEL_REF( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho,
|
||||
double* restrict z, inc_t incz )
|
||||
void bli_ddotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho,
|
||||
double* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
BLIS_DDOTAXPYV_KERNEL_REF( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
void bli_cdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
BLIS_CDOTAXPYV_KERNEL_REF( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
void bli_zdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/*
|
||||
Template dotaxpyv kernel implementation
|
||||
@@ -240,15 +240,15 @@ void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
BLIS_ZDOTAXPYV_KERNEL_REF( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -429,42 +429,3 @@ void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
|
||||
bli_zzcopys( dotxy, *rho );
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_xy* restrict rho, \
|
||||
ctype_z* restrict z, inc_t incz \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(chx,chy,chz,kername)( conjxt, \
|
||||
conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
rho, \
|
||||
z, incz ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -36,115 +36,115 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict w, inc_t incw,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz )
|
||||
void bli_sdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict w, inc_t incw,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_SDOTXAXPYF_KERNEL_REF( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict w, inc_t incw,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz )
|
||||
void bli_ddotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict w, inc_t incw,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_DDOTXAXPYF_KERNEL_REF( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict w, inc_t incw,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
void bli_cdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict w, inc_t incw,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_CDOTXAXPYF_KERNEL_REF( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict w, inc_t incw,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
void bli_zdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict w, inc_t incw,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
|
||||
{
|
||||
/*
|
||||
@@ -289,19 +289,19 @@ void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_ZDOTXAXPYF_KERNEL_REF( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -560,51 +560,3 @@ void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chb,chc,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conja, \
|
||||
conj_t conjw, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ab* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_b* restrict w, inc_t incw, \
|
||||
ctype_b* restrict x, inc_t incx, \
|
||||
ctype_c* restrict beta, \
|
||||
ctype_c* restrict y, inc_t incy, \
|
||||
ctype_c* restrict z, inc_t incz \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(cha,chx,chy,kername)( conjat, \
|
||||
conja, \
|
||||
conjw, \
|
||||
conjx, \
|
||||
m, \
|
||||
b_n, \
|
||||
alpha, \
|
||||
a, inca, lda, \
|
||||
w, incw, \
|
||||
x, incx, \
|
||||
beta, \
|
||||
y, incy, \
|
||||
z, incz ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chb,chc,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conja, \
|
||||
conj_t conjw, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ab* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_b* restrict w, inc_t incw, \
|
||||
ctype_b* restrict x, inc_t incx, \
|
||||
ctype_c* restrict beta, \
|
||||
ctype_c* restrict y, inc_t incy, \
|
||||
ctype_c* restrict z, inc_t incz \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxaxpyf_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_opt_var1 )
|
||||
#endif
|
||||
|
||||
@@ -36,95 +36,95 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
void bli_sdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
BLIS_SDOTXF_KERNEL_REF( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
void bli_ddotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
BLIS_DDOTXF_KERNEL_REF( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
void bli_cdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
BLIS_CDOTXF_KERNEL_REF( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
void bli_zdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template dotxf kernel implementation
|
||||
@@ -265,15 +265,15 @@ void bli_zzzdotxf_opt_var1(
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
BLIS_ZDOTXF_KERNEL_REF( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -414,43 +414,3 @@ void bli_zzzdotxf_opt_var1(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_xy* restrict alpha, \
|
||||
ctype_x* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_y* restrict x, inc_t incx, \
|
||||
ctype_r* restrict beta, \
|
||||
ctype_r* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(cha,chx,chy,kername)( conjat, \
|
||||
conjx, \
|
||||
m, \
|
||||
b_n, \
|
||||
alpha, \
|
||||
a, inca, lda, \
|
||||
x, incx, \
|
||||
beta, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxf_opt_var1, dotxf_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxf_opt_var1, dotxf_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,63 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ax* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict beta, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxf_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxf_opt_var1 )
|
||||
#endif
|
||||
|
||||
@@ -47,7 +47,7 @@ void bli_sgemm_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sgemm_ref_mxn( k,
|
||||
BLIS_SGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a1,
|
||||
b1,
|
||||
@@ -275,7 +275,7 @@ void bli_cgemm_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cgemm_ref_mxn( k,
|
||||
BLIS_CGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a1,
|
||||
b1,
|
||||
@@ -297,7 +297,7 @@ void bli_zgemm_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_zgemm_ref_mxn( k,
|
||||
BLIS_ZGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a1,
|
||||
b1,
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype micro-kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a1, \
|
||||
ctype* restrict b1, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_opt_mxn )
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype micro-kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict b01, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_mxn )
|
||||
|
||||
@@ -44,7 +44,7 @@ void bli_strsm_l_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_strsm_l_ref_mxn( a11,
|
||||
BLIS_STRSM_L_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
@@ -216,7 +216,7 @@ void bli_ctrsm_l_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ctrsm_l_ref_mxn( a11,
|
||||
BLIS_CTRSM_L_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
@@ -232,7 +232,7 @@ void bli_ztrsm_l_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ztrsm_l_ref_mxn( a11,
|
||||
BLIS_ZTRSM_L_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
|
||||
@@ -37,25 +37,25 @@
|
||||
|
||||
|
||||
void bli_strsm_u_opt_mxn(
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a11,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_strsm_u_ref_mxn( a,
|
||||
b,
|
||||
c, rs_c, cs_c,
|
||||
BLIS_STRSM_U_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dtrsm_u_opt_mxn(
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a11,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
@@ -168,18 +168,18 @@ void bli_dtrsm_u_opt_mxn(
|
||||
{
|
||||
i = m - iter - 1;
|
||||
n_behind = iter;
|
||||
alpha11 = a + (i )*rs_a + (i )*cs_a;
|
||||
a12t = a + (i )*rs_a + (i+1)*cs_a;
|
||||
x1 = b + (i )*rs_b + (0 )*cs_b;
|
||||
X2 = b + (i+1)*rs_b + (0 )*cs_b;
|
||||
alpha11 = a11 + (i )*rs_a + (i )*cs_a;
|
||||
a12t = a11 + (i )*rs_a + (i+1)*cs_a;
|
||||
x1 = b11 + (i )*rs_b + (0 )*cs_b;
|
||||
X2 = b11 + (i+1)*rs_b + (0 )*cs_b;
|
||||
|
||||
/* x1 = x1 - a12t * X2; */
|
||||
/* x1 = x1 / alpha11; */
|
||||
for ( j = 0; j < n; ++j )
|
||||
{
|
||||
chi11 = x1 + (0 )*rs_b + (j )*cs_b;
|
||||
x21 = X2 + (0 )*rs_b + (j )*cs_b;
|
||||
gamma11 = c + (i )*rs_c + (j )*cs_c;
|
||||
chi11 = x1 + (0 )*rs_b + (j )*cs_b;
|
||||
x21 = X2 + (0 )*rs_b + (j )*cs_b;
|
||||
gamma11 = c11 + (i )*rs_c + (j )*cs_c;
|
||||
|
||||
/* chi11 = chi11 - a12t * x21; */
|
||||
bli_dset0s( rho11 );
|
||||
@@ -208,32 +208,32 @@ void bli_dtrsm_u_opt_mxn(
|
||||
|
||||
|
||||
void bli_ctrsm_u_opt_mxn(
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ctrsm_u_ref_mxn( a,
|
||||
b,
|
||||
c, rs_c, cs_c,
|
||||
BLIS_CTRSM_U_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ztrsm_u_opt_mxn(
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ztrsm_u_ref_mxn( a,
|
||||
b,
|
||||
c, rs_c, cs_c,
|
||||
BLIS_ZTRSM_U_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype micro-kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_u_opt_mxn )
|
||||
|
||||
@@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( addv, ADDV_KERNEL )
|
||||
GENFRONT( addv, addv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_addv_check.h"
|
||||
#include "bli_addv_unb_var1.h"
|
||||
#include "bli_addv_kernel.h"
|
||||
#include "bli_addv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
115
frame/1/addv/bli_addv_kernel.c
Normal file
115
frame/1/addv/bli_addv_kernel.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T addv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_addv_kernel( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
n,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chx,chy,kername)( conjx, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( addv_kernel_void, ADDV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( addv_kernel_void, ADDV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( addv_kernel_void, ADDV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_subv_unb_var1( obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_addv_kernel( obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
@@ -46,12 +50,13 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( subv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( addv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( subv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( addv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( subv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( addv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T addv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_addv_unb_var1( obj_t* x,
|
||||
void bli_addv_ref( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -83,17 +84,19 @@ void bli_addv_unb_var1( obj_t* x,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
@@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( addv, addv_unb_var1 )
|
||||
INSERT_GENTFUNC2_BASIC( addv, addv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( addv, addv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_D( addv, addv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( addv, addv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_P( addv, addv_ref )
|
||||
#endif
|
||||
|
||||
60
frame/1/addv/bli_addv_ref.h
Normal file
60
frame/1/addv/bli_addv_ref.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_addv_ref( obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( addv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( addv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( addv_ref )
|
||||
#endif
|
||||
@@ -68,7 +68,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( axpyv, AXPYV_KERNEL )
|
||||
GENFRONT( axpyv, axpyv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_axpyv_check.h"
|
||||
#include "bli_axpyv_unb_var1.h"
|
||||
#include "bli_axpyv_kernel.h"
|
||||
#include "bli_axpyv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
128
frame/1/axpyv/bli_axpyv_kernel.c
Normal file
128
frame/1/axpyv/bli_axpyv_kernel.c
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T axpyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_axpyv_kernel( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// If alpha is a scalar constant, use dt_x to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the alpha object and extract the buffer at the alpha offset.
|
||||
bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_alpha][dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC3(cha,chx,chy,kername)( conjx, \
|
||||
n, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( axpyv_kernel_void, AXPYV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( axpyv_kernel_void, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( axpyv_kernel_void, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,11 +32,15 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_axpyv_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_axpyv_kernel( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
|
||||
\
|
||||
@@ -48,13 +52,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( axpyv_unb_var1 )
|
||||
INSERT_GENTPROT3_BASIC( axpyv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T axpyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -47,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_axpyv_unb_var1( obj_t* alpha,
|
||||
void bli_axpyv_ref( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
@@ -94,18 +94,19 @@ void bli_axpyv_unb_var1( obj_t* alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname, addvker ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC3(cha,chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_a* alpha_cast = alpha; \
|
||||
ctype_x* x_cast = x; \
|
||||
@@ -156,13 +157,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( axpyv_unb_var1, ADDV_KERNEL )
|
||||
INSERT_GENTFUNC3_BASIC( axpyv_ref, ADDV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( axpyv_unb_var1, ADDV_KERNEL )
|
||||
INSERT_GENTFUNC3_MIX_D( axpyv_ref, ADDV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( axpyv_unb_var1, ADDV_KERNEL )
|
||||
INSERT_GENTFUNC3_MIX_P( axpyv_ref, ADDV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,28 +32,32 @@
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_axpyv_ref( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype axpyv kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
void PASTEMAC3(cha,chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( axpyv_opt_var1 )
|
||||
INSERT_GENTPROT3_BASIC( axpyv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 )
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 )
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_ref )
|
||||
#endif
|
||||
|
||||
@@ -34,16 +34,6 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
void bli_copyv( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_copyv_check( x, y );
|
||||
|
||||
bli_copyv_unb_var1( x, y );
|
||||
}
|
||||
*/
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
@@ -63,7 +53,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( copyv, COPYV_KERNEL )
|
||||
GENFRONT( copyv, copyv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_copyv_check.h"
|
||||
#include "bli_copyv_unb_var1.h"
|
||||
#include "bli_copyv_kernel.h"
|
||||
#include "bli_copyv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
115
frame/1/copyv/bli_copyv_kernel.c
Normal file
115
frame/1/copyv/bli_copyv_kernel.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T copyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_copyv_kernel( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
n,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chx,chy,kername)( conjx, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( copyv_kernel_void, COPYV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( copyv_kernel_void, COPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( copyv_kernel_void, COPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_copyv_unb_var1( obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_copyv_kernel( obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
@@ -46,12 +50,12 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( copyv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( copyv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( copyv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( copyv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( copyv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( copyv_kernel_void )
|
||||
#endif
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T copyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_copyv_unb_var1( obj_t* x,
|
||||
void bli_copyv_ref( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -83,17 +84,19 @@ void bli_copyv_unb_var1( obj_t* x,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
@@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( copyv, copyv_unb_var1 )
|
||||
INSERT_GENTFUNC2_BASIC( copyv, copyv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( copyv, copyv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_D( copyv, copyv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( copyv, copyv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_P( copyv, copyv_ref )
|
||||
#endif
|
||||
|
||||
60
frame/1/copyv/bli_copyv_ref.h
Normal file
60
frame/1/copyv/bli_copyv_ref.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_copyv_ref( obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( copyv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( copyv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( copyv_ref )
|
||||
#endif
|
||||
@@ -55,7 +55,7 @@ void PASTEMAC0(opname)( \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
GENFRONT( dotv, DOTV_KERNEL )
|
||||
GENFRONT( dotv, dotv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_dotv_check.h"
|
||||
#include "bli_dotv_unb_var1.h"
|
||||
#include "bli_dotv_kernel.h"
|
||||
#include "bli_dotv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
128
frame/1/dotv/bli_dotv_kernel.c
Normal file
128
frame/1/dotv/bli_dotv_kernel.c
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T dotv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy,
|
||||
void* rho
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotv_kernel( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
num_t dt_rho = bli_obj_datatype( *rho );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
conj_t conjy = bli_obj_conj_status( *y );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
void* buf_rho = bli_obj_buffer_at_off( *rho );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y][dt_rho];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
conjy,
|
||||
n,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y,
|
||||
buf_rho );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
void* rho \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC3(chx,chy,chr,kername)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( dotv_kernel_void, DOTV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( dotv_kernel_void, DOTV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( dotv_kernel_void, DOTV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,11 +32,15 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_dotv_unb_var1( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho );
|
||||
void bli_dotv_kernel( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
|
||||
\
|
||||
@@ -49,13 +53,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
void* rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( dotv_unb_var1 )
|
||||
INSERT_GENTPROT3_BASIC( dotv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( dotv_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_D( dotv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( dotv_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_P( dotv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T dotv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -48,17 +49,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotv_unb_var1( obj_t* x,
|
||||
void bli_dotv_ref( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho )
|
||||
{
|
||||
@@ -92,19 +93,20 @@ void bli_dotv_unb_var1( obj_t* x,
|
||||
buf_y, inc_y,
|
||||
buf_rho );
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
void* rho \
|
||||
) \
|
||||
void PASTEMAC3(chx,chy,chr,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict rho \
|
||||
) \
|
||||
{ \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
@@ -163,13 +165,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( dotv, dotv_unb_var1 )
|
||||
INSERT_GENTFUNC3_BASIC( dotv, dotv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( dotv, dotv_unb_var1 )
|
||||
INSERT_GENTFUNC3_MIX_D( dotv, dotv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( dotv, dotv_unb_var1 )
|
||||
INSERT_GENTFUNC3_MIX_P( dotv, dotv_ref )
|
||||
#endif
|
||||
|
||||
64
frame/1/dotv/bli_dotv_ref.h
Normal file
64
frame/1/dotv/bli_dotv_ref.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_dotv_ref( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( dotv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( dotv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( dotv_ref )
|
||||
#endif
|
||||
|
||||
@@ -59,7 +59,7 @@ void PASTEMAC0(opname)( \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
GENFRONT( dotxv, DOTXV_KERNEL )
|
||||
GENFRONT( dotxv, dotxv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_dotxv_check.h"
|
||||
#include "bli_dotxv_unb_var1.h"
|
||||
#include "bli_dotxv_kernel.h"
|
||||
#include "bli_dotxv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
153
frame/1/dotxv/bli_dotxv_kernel.c
Normal file
153
frame/1/dotxv/bli_dotxv_kernel.c
Normal file
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T dotxv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy,
|
||||
void* beta,
|
||||
void* rho
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotxv_kernel( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
num_t dt_rho = bli_obj_datatype( *rho );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
conj_t conjy = bli_obj_conj_status( *y );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
void* buf_rho = bli_obj_buffer_at_off( *rho );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// The datatype of alpha MUST be the type union of x and y. This is to
|
||||
// prevent any unnecessary loss of information during computation.
|
||||
dt_alpha = bli_datatype_union( dt_x, dt_y );
|
||||
buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );
|
||||
|
||||
// The datatype of beta MUST be the same as the datatype of rho.
|
||||
dt_beta = dt_rho;
|
||||
buf_beta = bli_obj_buffer_for_1x1( dt_beta, *beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y][dt_rho];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
conjy,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y,
|
||||
buf_beta,
|
||||
buf_rho );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
void* beta, \
|
||||
void* rho \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC3(chx,chy,chr,kername)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
beta, \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxv_kernel_void, DOTXV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxv_kernel_void, DOTXV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxv_kernel_void, DOTXV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,13 +32,17 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_dotxv_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho );
|
||||
void bli_dotxv_kernel( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
|
||||
\
|
||||
@@ -53,13 +57,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
void* rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxv_unb_var1 )
|
||||
INSERT_GENTPROT3U12_BASIC( dotxv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxv_unb_var1 )
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxv_unb_var1 )
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T dotxv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -50,17 +51,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotxv_unb_var1( obj_t* alpha,
|
||||
void bli_dotxv_ref( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
@@ -113,21 +114,23 @@ void bli_dotxv_unb_var1( obj_t* alpha,
|
||||
buf_beta,
|
||||
buf_rho );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
void* beta, \
|
||||
void* rho \
|
||||
) \
|
||||
void PASTEMAC3(chx,chy,chr,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_xy* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict beta, \
|
||||
ctype_r* restrict rho \
|
||||
) \
|
||||
{ \
|
||||
ctype_xy* alpha_cast = alpha; \
|
||||
ctype_x* x_cast = x; \
|
||||
@@ -194,13 +197,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxv, dotxv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxv, dotxv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxv, dotxv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxv, dotxv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxv, dotxv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxv, dotxv_ref )
|
||||
#endif
|
||||
|
||||
68
frame/1/dotxv/bli_dotxv_ref.h
Normal file
68
frame/1/dotxv/bli_dotxv_ref.h
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_dotxv_ref( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_xy* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict beta, \
|
||||
ctype_r* restrict rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxv_ref )
|
||||
#endif
|
||||
|
||||
@@ -34,15 +34,6 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
void bli_invertv( obj_t* x )
|
||||
{
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_invertv_check( x );
|
||||
|
||||
bli_invertv_unb_var1( x );
|
||||
}
|
||||
*/
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
@@ -60,7 +51,7 @@ void PASTEMAC0(opname)( \
|
||||
PASTEMAC0(varname)( x ); \
|
||||
}
|
||||
|
||||
GENFRONT( invertv, INVERTV_KERNEL )
|
||||
GENFRONT( invertv, invertv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_invertv_check.h"
|
||||
#include "bli_invertv_unb_var1.h"
|
||||
#include "bli_invertv_kernel.h"
|
||||
#include "bli_invertv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -34,19 +34,48 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T invertv_fp
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
void* x, inc_t incx
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_4x6 )
|
||||
static FUNCPTR_T GENARRAY(ftypes,invertv_kernel_void);
|
||||
|
||||
|
||||
void bli_invertv_kernel( obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x];
|
||||
|
||||
// Invoke the function.
|
||||
f( n,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,kername)( n, \
|
||||
x, incx ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( invertv_kernel_void, INVERTV_KERNEL )
|
||||
|
||||
@@ -32,9 +32,13 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_invertv_unb_var1( obj_t* x );
|
||||
void bli_invertv_kernel( obj_t* x );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
@@ -43,5 +47,5 @@ void PASTEMAC(ch,varname)( \
|
||||
void* x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( invertv_unb_var1 )
|
||||
INSERT_GENTPROT_BASIC( invertv_kernel_void )
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T invertv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -41,10 +42,10 @@ typedef void (*FUNCPTR_T)(
|
||||
void* x, inc_t incx
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,invertv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY(ftypes,invertv_ref);
|
||||
|
||||
|
||||
void bli_invertv_unb_var1( obj_t* x )
|
||||
void bli_invertv_ref( obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
|
||||
@@ -63,15 +64,17 @@ void bli_invertv_unb_var1( obj_t* x )
|
||||
f( n,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* restrict x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
ctype* x_cast = x; \
|
||||
ctype* chi1; \
|
||||
@@ -89,5 +92,5 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( invertv, invertv_unb_var1 )
|
||||
INSERT_GENTFUNC_BASIC( invertv, invertv_ref )
|
||||
|
||||
@@ -32,16 +32,19 @@
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_invertv_ref( obj_t* x );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* restrict x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_ref_4x4 )
|
||||
INSERT_GENTPROT_BASIC( invertv_ref )
|
||||
|
||||
@@ -68,7 +68,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( scal2v, SCAL2V_KERNEL )
|
||||
GENFRONT( scal2v, scal2v_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_scal2v_check.h"
|
||||
#include "bli_scal2v_unb_var1.h"
|
||||
#include "bli_scal2v_kernel.h"
|
||||
#include "bli_scal2v_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
129
frame/1/scal2v/bli_scal2v_kernel.c
Normal file
129
frame/1/scal2v/bli_scal2v_kernel.c
Normal file
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T scal2v_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* beta,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_scal2v_kernel( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// If beta is a scalar constant, use dt_x to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the beta object and extract the buffer at the beta offset.
|
||||
bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_beta][dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
n,
|
||||
buf_beta,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chb,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC3(chb,chx,chy,kername)( conjx, \
|
||||
n, \
|
||||
beta, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( scal2v_kernel_void, SCAL2V_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( scal2v_kernel_void, SCAL2V_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( scal2v_kernel_void, SCAL2V_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,11 +32,15 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_scal2v_unb_var1( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_scal2v_kernel( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname ) \
|
||||
\
|
||||
@@ -48,13 +52,13 @@ void PASTEMAC3(chb,chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( scal2v_unb_var1 )
|
||||
INSERT_GENTPROT3_BASIC( scal2v_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( scal2v_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_D( scal2v_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( scal2v_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_P( scal2v_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T scal2v_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -47,17 +48,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_scal2v_unb_var1( obj_t* beta,
|
||||
void bli_scal2v_ref( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
@@ -94,18 +95,20 @@ void bli_scal2v_unb_var1( obj_t* beta,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname, setvker ) \
|
||||
\
|
||||
void PASTEMAC3(chb,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC3(chb,chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_b* beta_cast = beta; \
|
||||
ctype_x* x_cast = x; \
|
||||
@@ -155,13 +158,13 @@ void PASTEMAC3(chb,chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( scal2v_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC3_BASIC( scal2v_ref, SETV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( scal2v_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC3_MIX_D( scal2v_ref, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( scal2v_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC3_MIX_P( scal2v_ref, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
63
frame/1/scal2v/bli_scal2v_ref.h
Normal file
63
frame/1/scal2v/bli_scal2v_ref.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_scal2v_ref( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chb,chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( scal2v_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( scal2v_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( scal2v_ref )
|
||||
#endif
|
||||
|
||||
@@ -66,7 +66,7 @@ void PASTEMAC0(opname)( \
|
||||
x ); \
|
||||
}
|
||||
|
||||
GENFRONT( scalv, SCALV_KERNEL )
|
||||
GENFRONT( scalv, scalv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -36,7 +36,8 @@
|
||||
#include "bli_scalv_check.h"
|
||||
#include "bli_scalv_int.h"
|
||||
|
||||
#include "bli_scalv_unb_var1.h"
|
||||
#include "bli_scalv_kernel.h"
|
||||
#include "bli_scalv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -42,7 +42,7 @@ typedef void (*FUNCPTR_T)( obj_t* beta,
|
||||
static FUNCPTR_T vars[1][3] =
|
||||
{
|
||||
// unblocked optimized unblocked blocked
|
||||
{ bli_scalv_unb_var1, NULL, NULL }
|
||||
{ bli_scalv_kernel, bli_scalv_kernel, NULL }
|
||||
};
|
||||
|
||||
void bli_scalv_int( obj_t* beta,
|
||||
|
||||
120
frame/1/scalv/bli_scalv_kernel.c
Normal file
120
frame/1/scalv/bli_scalv_kernel.c
Normal file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T scalv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjbeta,
|
||||
dim_t n,
|
||||
void* beta,
|
||||
void* x, inc_t incx
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_scalv_kernel( obj_t* beta,
|
||||
obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
|
||||
conj_t conjbeta = bli_obj_conj_status( *beta );
|
||||
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// If beta is a scalar constant, use dt_x to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the beta object and extract the buffer at the beta offset.
|
||||
bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_beta][dt_x];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjbeta,
|
||||
n,
|
||||
buf_beta,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname)( \
|
||||
conj_t conjbeta, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chb,chx,kername)( conjbeta, \
|
||||
n, \
|
||||
beta, \
|
||||
x, incx ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( scalv_kernel_void, SCALV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( scalv_kernel_void, SCALV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( scalv_kernel_void, SCALV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_scalv_unb_var1( obj_t* beta,
|
||||
obj_t* x );
|
||||
void bli_scalv_kernel( obj_t* beta,
|
||||
obj_t* x );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
|
||||
\
|
||||
@@ -46,13 +50,13 @@ void PASTEMAC2(chb,chx,varname)( \
|
||||
void* x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( scalv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( scalv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( scalv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( scalv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( scalv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( scalv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T scalv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_scalv_unb_var1( obj_t* beta,
|
||||
void bli_scalv_ref( obj_t* beta,
|
||||
obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -88,17 +89,19 @@ void bli_scalv_unb_var1( obj_t* beta,
|
||||
buf_beta,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, setvker ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname)( \
|
||||
conj_t conjbeta, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
void PASTEMAC2(chb,chx,varname) \
|
||||
( \
|
||||
conj_t conjbeta, \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
ctype_b* beta_cast = beta; \
|
||||
ctype_x* x_cast = x; \
|
||||
@@ -136,13 +139,13 @@ void PASTEMAC2(chb,chx,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( scalv_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC2_BASIC( scalv_ref, SETV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( scalv_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC2_MIX_D( scalv_ref, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( scalv_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC2_MIX_P( scalv_ref, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
61
frame/1/scalv/bli_scalv_ref.h
Normal file
61
frame/1/scalv/bli_scalv_ref.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_scalv_ref( obj_t* beta,
|
||||
obj_t* x );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname) \
|
||||
( \
|
||||
conj_t conjbeta, \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( scalv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( scalv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( scalv_ref )
|
||||
#endif
|
||||
|
||||
@@ -67,7 +67,7 @@ void PASTEMAC0(opname)( \
|
||||
x ); \
|
||||
}
|
||||
|
||||
GENFRONT( setv, SETV_KERNEL )
|
||||
GENFRONT( setv, setv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -34,8 +34,8 @@
|
||||
|
||||
#include "bli_setv_check.h"
|
||||
|
||||
#include "bli_setv_unb_var1.h"
|
||||
#include "bli_setv_unb_var2.h"
|
||||
#include "bli_setv_kernel.h"
|
||||
#include "bli_setv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
113
frame/1/setv/bli_setv_kernel.c
Normal file
113
frame/1/setv/bli_setv_kernel.c
Normal file
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T setv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
void* beta,
|
||||
void* x, inc_t incx
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_setv_kernel( obj_t* beta,
|
||||
obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
|
||||
void* buf_beta;
|
||||
num_t dt_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// If beta is a scalar constant, use dt_x to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the beta object and extract the buffer at the beta offset.
|
||||
bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_beta][dt_x];
|
||||
|
||||
// Invoke the function.
|
||||
f( n,
|
||||
buf_beta,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname)( \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chb,chx,kername)( n, \
|
||||
beta, \
|
||||
x, incx ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( setv_kernel_void, SETV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( setv_kernel_void, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( setv_kernel_void, SETV_KERNEL )
|
||||
#endif
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_setv_unb_var1( obj_t* beta,
|
||||
obj_t* x );
|
||||
void bli_setv_kernel( obj_t* beta,
|
||||
obj_t* x );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
|
||||
\
|
||||
@@ -45,13 +49,13 @@ void PASTEMAC2(chb,chx,varname)( \
|
||||
void* x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( setv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( setv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( setv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( setv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( setv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( setv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T setv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -45,17 +46,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_setv_unb_var1( obj_t* beta,
|
||||
void bli_setv_ref( obj_t* beta,
|
||||
obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -84,16 +85,17 @@ void bli_setv_unb_var1( obj_t* beta,
|
||||
buf_beta,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname)( \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
void PASTEMAC2(chb,chx,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
ctype_b* beta_cast = beta; \
|
||||
ctype_x* chi1 = x; \
|
||||
@@ -123,12 +125,12 @@ void PASTEMAC2(chb,chx,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( setv, setv_unb_var1 )
|
||||
INSERT_GENTFUNC2_BASIC( setv, setv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( setv, setv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_D( setv, setv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( setv, setv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_P( setv, setv_ref )
|
||||
#endif
|
||||
60
frame/1/setv/bli_setv_ref.h
Normal file
60
frame/1/setv/bli_setv_ref.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_setv_ref( obj_t* beta,
|
||||
obj_t* x );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( setv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( setv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( setv_ref )
|
||||
#endif
|
||||
|
||||
@@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( subv, SUBV_KERNEL )
|
||||
GENFRONT( subv, subv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_subv_check.h"
|
||||
#include "bli_subv_unb_var1.h"
|
||||
#include "bli_subv_kernel.h"
|
||||
#include "bli_subv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
115
frame/1/subv/bli_subv_kernel.c
Normal file
115
frame/1/subv/bli_subv_kernel.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T subv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_subv_kernel( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
n,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chx,chy,kername)( conjx, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( subv_kernel_void, SUBV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( subv_kernel_void, SUBV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( subv_kernel_void, SUBV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_addv_unb_var1( obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_subv_kernel( obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
@@ -46,12 +50,12 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( addv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( subv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( addv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( subv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( addv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( subv_kernel_void )
|
||||
#endif
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T subv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_subv_unb_var1( obj_t* x,
|
||||
void bli_subv_ref( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -83,17 +84,19 @@ void bli_subv_unb_var1( obj_t* x,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
@@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( subv, subv_unb_var1 )
|
||||
INSERT_GENTFUNC2_BASIC( subv, subv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( subv, subv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_D( subv, subv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( subv, subv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_P( subv, subv_ref )
|
||||
#endif
|
||||
|
||||
60
frame/1/subv/bli_subv_ref.h
Normal file
60
frame/1/subv/bli_subv_ref.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_subv_ref( obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( subv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( subv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( subv_ref )
|
||||
#endif
|
||||
@@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( swapv, SWAPV_KERNEL )
|
||||
GENFRONT( swapv, swapv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_swapv_check.h"
|
||||
#include "bli_swapv_unb_var1.h"
|
||||
#include "bli_swapv_kernel.h"
|
||||
#include "bli_swapv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
110
frame/1/swapv/bli_swapv_kernel.c
Normal file
110
frame/1/swapv/bli_swapv_kernel.c
Normal file
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T swapv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,swapv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,swapv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,swapv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_swapv_kernel( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( n,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chx,chy,kername)( n, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( swapv_kernel_void, SWAPV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( swapv_kernel_void, SWAPV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( swapv_kernel_void, SWAPV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_swapv_unb_var1( obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_swapv_kernel( obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
@@ -45,12 +49,12 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( swapv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( swapv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( swapv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( swapv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( swapv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( swapv_kernel_void )
|
||||
#endif
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user