Added extensive support for configuration defaults.

Details:
- Standard names for reference kernels (levels-1v, -1f and 3) are now
  macro constants. Examples:
    BLIS_SAXPYV_KERNEL_REF
    BLIS_DDOTXF_KERNEL_REF
    BLIS_ZGEMM_UKERNEL_REF
- Developers no longer have to name all datatype instances of a kernel
  with a common base name; [sdcz] datatype flavors of each kernel or
  micro-kernel (level-1v, -1f, or 3) may now be named independently.
  This means you can now, if you wish, encode the datatype-specific
  register blocksizes in the name of the micro-kernel functions.
- Any datatype instance of any kernel (1v, 1f, or 3) that is left
  undefined in bli_kernel.h will default to the corresponding reference
  implementation. For example, if BLIS_DGEMM_UKERNEL is left undefined,
  it will be defined to be BLIS_DGEMM_UKERNEL_REF.
- Developers no longer need to name level-1v/-1f kernels with multiple
  datatype chars to match the number of types the kernel WOULD take in
  a mixed type environment, as in bli_dddaxpyv_opt(). Now, one char is
  sufficient, as in bli_daxpyv_opt().
- There is no longer a need to define an obj_t wrapper to go along with
  your level-1v/-1f kernels. The framework now provides a _kernel()
  function which serves as the obj_t wrapper for whatever kernels are
  specified (or defaulted to) via bli_kernel.h.
- Developers no longer need to prototype their kernels, and thus no
  longer need to include any prototyping headers from within
  bli_kernel.h. The framework now generates kernel prototypes, with the
  proper type signature, based on the kernel names defined (or defaulted
  to) via bli_kernel.h.
- If the complex datatype x (of [cz]) implementation of the gemm micro-
  kernel is left undefined by bli_kernel.h, but its same-precision real
  domain equivalent IS defined, BLIS will use a 4m-based implementation
  for the datatype x implementations of all level-3 operations, using
  only the real gemm micro-kernel.
This commit is contained in:
Field G. Van Zee
2014-02-25 13:34:56 -06:00
parent 15b51e990f
commit fde5f1fdec
263 changed files with 8624 additions and 8356 deletions

View File

@@ -328,10 +328,12 @@ MK_BLIS_CONFIG_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG
$(filter %.S, $(MK_CONFIG_SRC)))
MK_BLIS_CONFIG_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.c, $(MK_CONFIG_SRC)))
MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.S, $(MK_CONFIG_NOOPT_SRC)))
MK_BLIS_CONFIG_NOOPT_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.c, $(MK_CONFIG_NOOPT_SRC)))
MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.S, $(MK_CONFIG_KERNELS_SRC)))
MK_BLIS_CONFIG_KERNELS_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -70,35 +70,7 @@
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +88,34 @@
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,105 +123,28 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
// used by certain blocked variants. But when they *are* used, they MUST
// be an integer multiple of NR!
#define BLIS_DEFAULT_NI_FAC 16
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
@@ -233,16 +152,13 @@
// -- gemm --
#include "bli_gemm_opt_4x4.h"
#define GEMM_UKERNEL gemm_opt_4x4
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4
#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -250,25 +166,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -276,23 +175,14 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -300,48 +190,26 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

1
config/armv7a/kernels Symbolic link
View File

@@ -0,0 +1 @@
../../kernels/armv7a

View File

@@ -1,53 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// #include "arm_neon.h"
// Prototype template for the gemm_opt_4x4 micro-kernel.
//
// GENTPROT is first undefined (dropping any earlier definition) and then
// redefined so that each expansion emits one function prototype:
//   - the function name is produced by PASTEMAC(ch,varname)
//     (NOTE(review): presumably yields bli_<ch><varname>, e.g.
//     bli_dgemm_opt_4x4 -- confirm against the PASTEMAC definition);
//   - the return type is void;
//   - the arguments are k, then restrict-qualified ctype pointers alpha,
//     a, b, beta and c, followed by rs_c and cs_c
//     (NOTE(review): presumably the row and column strides of c --
//     confirm), and finally an auxinfo_t pointer, data.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
// Apply the template above to gemm_opt_4x4.
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// invokes GENTPROT once per basic datatype -- confirm.
INSERT_GENTPROT_BASIC( gemm_opt_4x4 )

View File

@@ -36,6 +36,9 @@
#define BLIS_CONFIG_H
#undef restrict
// -- OPERATING SYSTEM ---------------------------------------------------------

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -76,35 +76,7 @@
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
@@ -122,10 +94,34 @@
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -133,48 +129,22 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
@@ -182,44 +152,26 @@
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_L1F_FUSE_FAC_S 8
#define BLIS_L1F_FUSE_FAC_D 4
#define BLIS_L1F_FUSE_FAC_C 4
#define BLIS_L1F_FUSE_FAC_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
@@ -229,16 +181,11 @@
#include "bli_gemm_8x8.h"
#define GEMM_UKERNEL gemm_8x8
#define GEMM_UKERNEL_MT gemm_8x8_mt
#define BLIS_DGEMM_UKERNEL bli_dgemm_8x8
#define BLIS_DGEMM_UKERNEL_MT bli_dgemm_8x8_mt
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -246,25 +193,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -272,25 +202,16 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#include "bli_axpyf_opt_var1.h"
#define AXPYF_KERNEL axpyf_opt_var1
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -298,52 +219,30 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#include "bli_axpyv_opt_var1.h"
#define AXPYV_KERNEL axpyv_opt_var1
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#include "bli_dotv_opt_var1.h"
#define DOTV_KERNEL dotv_opt_var1
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -70,35 +70,7 @@
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +88,34 @@
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,93 +123,27 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
@@ -221,16 +151,11 @@
// -- gemm --
#include "bli_gemm_opt_4x4.h"
#define GEMM_UKERNEL gemm_opt_4x4
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -238,25 +163,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -264,23 +172,14 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -288,48 +187,26 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -70,35 +70,7 @@
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +88,34 @@
// of f, handles leftover edge cases (i.e., when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,93 +123,27 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
@@ -221,16 +151,11 @@
// -- gemm --
#include "bli_gemm_opt_4x4.h"
#define GEMM_UKERNEL gemm_opt_4x4
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -238,25 +163,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -264,23 +172,14 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -288,48 +187,26 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -64,14 +64,6 @@
// scomplex and dcomplex being defined in terms of simple structs.
//#define BLIS_ENABLE_C99_COMPLEX
// Enable use of the 4m method to implement complex domain level-3
// operations. By enabling this option, special code is activated that
// induces complex level-3 operations using ONLY the real domain
// micro-kernels. This allows kernel authors to focus on optimizing
// the real micro-kernels, which can then be leveraged to provide their
// complex counterparts "for free".
#define BLIS_ENABLE_COMPLEX_VIA_4M
// -- MULTITHREADING -----------------------------------------------------------

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -62,13 +62,13 @@
#define BLIS_DEFAULT_KC_D 384
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 384
#define BLIS_DEFAULT_KC_C 384
#define BLIS_DEFAULT_NC_C 4096
//#define BLIS_DEFAULT_MC_C 384
//#define BLIS_DEFAULT_KC_C 384
//#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 192
#define BLIS_DEFAULT_KC_Z 384
#define BLIS_DEFAULT_NC_Z 4096
//#define BLIS_DEFAULT_MC_Z 192
//#define BLIS_DEFAULT_KC_Z 384
//#define BLIS_DEFAULT_NC_Z 4096
// NOTE: If 4m blocksizes are not defined here, they will be determined
// from the corresponding real domain blocksizes.
@@ -90,35 +90,7 @@
#define BLIS_DEFAULT_3M_KC_Z 256
#define BLIS_DEFAULT_3M_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
@@ -136,10 +108,34 @@
// of f, handles leftover edge cases (i.e., when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -147,234 +143,99 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
#include "bli_gemm_opt_d4x4.h"
#include "bli_gemmtrsm_l_opt_d4x4.h"
#include "bli_gemmtrsm_u_opt_d4x4.h"
//#include "bli_gemmtrsm_l_ref_mxn.h"
//#include "bli_gemmtrsm_u_ref_mxn.h"
//#include "bli_trsm_l_ref_4x4.h"
//#include "bli_trsm_u_ref_4x4.h"
#include "bli_trsm_l_ref_mxn.h"
#include "bli_trsm_u_ref_mxn.h"
// -- gemm --
#define GEMM_UKERNEL gemm_opt_d4x4
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
// -- trsm-related --
//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x4
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x4
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_opt_4x4
#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_opt_4x4
//#define TRSM_L_UKERNEL trsm_l_ref_4x4
//#define TRSM_U_UKERNEL trsm_u_ref_4x4
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
#include "bli_axpy2v_opt_var1.h"
#include "bli_dotaxpyv_opt_var1.h"
#include "bli_axpyf_opt_var1.h"
#include "bli_dotxf_opt_var1.h"
#include "bli_dotxaxpyf_opt_var1.h"
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_opt_var1
#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_opt_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_opt_var1
#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_opt_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_opt_var1
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_opt_var1
#define BLIS_DDOTXF_KERNEL bli_ddotxf_opt_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1
#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_opt_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
#include "bli_axpyv_opt_var1.h"
#include "bli_dotv_opt_var1.h"
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_opt_var1
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_opt_var1
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -70,35 +70,7 @@
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +88,34 @@
// of f, handles leftover edge cases (i.e., when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,111 +123,39 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
#include "bli_gemm_opt_d4x4.h"
// -- gemm --
#define GEMM_UKERNEL gemm_opt_d4x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_d4x4
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -239,25 +163,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -265,23 +172,14 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -289,48 +187,26 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -70,35 +70,7 @@
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +88,34 @@
// of f, handles leftover edge cases (i.e., when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,93 +123,28 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 2
#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
@@ -221,17 +152,10 @@
// -- gemm --
#include "bli_gemm_opt_30x8.h"
#define GEMM_UKERNEL gemm_opt_30x8
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -239,25 +163,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -265,23 +172,14 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -289,48 +187,26 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -70,35 +70,7 @@
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +88,34 @@
// of f, handles leftover edge cases (i.e., when k % f > 0), then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,93 +123,28 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
@@ -221,17 +152,10 @@
// -- gemm --
#include "bli_gemm_4x6.h"
#define GEMM_UKERNEL gemm_4x6
#define BLIS_DGEMM_UKERNEL bli_dgemm_4x6
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -239,25 +163,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -265,23 +172,14 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -289,48 +187,26 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -70,35 +70,7 @@
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +88,34 @@
// of f, handles leftover edge cases (i.e., when k % f > 0), then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,93 +123,28 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
@@ -221,18 +152,12 @@
// -- gemm --
//#define GEMM_UKERNEL gemm_ref_mxn
#include "bli_gemm_opt_8x4.h"
#define GEMM_UKERNEL gemm_opt_8x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -240,25 +165,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -266,23 +174,14 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -290,48 +189,26 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -35,300 +35,8 @@
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
//
// One MC/KC/NC triple is defined per datatype suffix (S, D, C, Z).
//
// Constraints on the values chosen:
//
//   (1) MC must be a multiple of both
//       (a) MR (for zero-padding purposes), and
//       (b) NR (for zero-padding purposes when MR and NR are "swapped").
//   (2) NC must be a multiple of both
//       (a) NR (for zero-padding purposes), and
//       (b) MR (for zero-padding purposes when MR and NR are "swapped").
//   (3) KC must be a multiple of both
//       (a) MR, and
//       (b) NR (for triangular operations such as trmm and trsm).
//

#define BLIS_DEFAULT_MC_S              64
#define BLIS_DEFAULT_KC_S              128
#define BLIS_DEFAULT_NC_S              4096

#define BLIS_DEFAULT_MC_D              64
#define BLIS_DEFAULT_KC_D              128
#define BLIS_DEFAULT_NC_D              4096

#define BLIS_DEFAULT_MC_C              64
#define BLIS_DEFAULT_KC_C              128
#define BLIS_DEFAULT_NC_C              4096

#define BLIS_DEFAULT_MC_Z              64
#define BLIS_DEFAULT_KC_Z              128
#define BLIS_DEFAULT_NC_Z              4096
// -- Cache blocksize extensions (for optimizing edge cases) --
//
// NOTE: The "extension" values below are subject to the same constraints
// as their corresponding default cache blocksizes. A non-zero extension
// causes the blocksize used at an edge case to be extended (enlarged)
// whenever the enlarged blocksize would encompass what remains of the
// matrix dimension.
// The trailing comment on each line retains a commented-out alternative
// value.

#define BLIS_EXTEND_MC_S               0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S               0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S               0 //(BLIS_DEFAULT_NC_S/4)

#define BLIS_EXTEND_MC_D               0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D               0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D               0 //(BLIS_DEFAULT_NC_D/4)

#define BLIS_EXTEND_MC_C               0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C               0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C               0 //(BLIS_DEFAULT_NC_C/4)

#define BLIS_EXTEND_MC_Z               0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z               0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z               0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
//
// NOTE: Under the reference configuration, the register blocksizes in
// the m and n dimensions should all equal the size that the reference
// micro-kernel(s) expect.

#define BLIS_DEFAULT_MR_S              8
#define BLIS_DEFAULT_NR_S              4

#define BLIS_DEFAULT_MR_D              8
#define BLIS_DEFAULT_NR_D              4

#define BLIS_DEFAULT_MR_C              8
#define BLIS_DEFAULT_NR_C              4

#define BLIS_DEFAULT_MR_Z              8
#define BLIS_DEFAULT_NR_Z              4

// NOTE: A micro-kernel is typically unrolled by some factor f. When the
// micro-kernel itself handles the leftover edge cases (i.e., when
// k % f > 0), the register blocksizes in the k dimension can simply be
// defined to 1.

#define BLIS_DEFAULT_KR_S              1
#define BLIS_DEFAULT_KR_D              1
#define BLIS_DEFAULT_KR_C              1
#define BLIS_DEFAULT_KR_Z              1
// -- Register blocksize extensions (for packed micro-panels) --
//
// NOTE: A register blocksize "extension" determines whether the leading
// dimension used within a packed micro-panel is equal to, or greater
// than, the corresponding register blocksize defined above.

#define BLIS_EXTEND_MR_S               0
#define BLIS_EXTEND_NR_S               0

#define BLIS_EXTEND_MR_D               0
#define BLIS_EXTEND_NR_D               0

#define BLIS_EXTEND_MR_C               0
#define BLIS_EXTEND_NR_C               0

#define BLIS_EXTEND_MR_Z               0
#define BLIS_EXTEND_NR_Z               0

// The k dimension has no use for register blocksize extensions, so all
// four are zero.

#define BLIS_EXTEND_KR_S               0
#define BLIS_EXTEND_KR_D               0
#define BLIS_EXTEND_KR_C               0
#define BLIS_EXTEND_KR_Z               0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
//
// NOTE: The values below control high-level cache blocking for level-2
// operations ONLY. As an example: performing gemv on a 2000x2000 matrix A
// with MC = NC = 1000 results in a total of four unblocked (or unblocked
// fused) gemv subproblems. The only benefit of the blocked algorithms is
// the opportunity they give for packing vectors. (Matrices could be packed
// here as well, but in practice that tends to be much too expensive to
// actually employ.)

#define BLIS_DEFAULT_L2_MC_S           1000
#define BLIS_DEFAULT_L2_NC_S           1000

#define BLIS_DEFAULT_L2_MC_D           1000
#define BLIS_DEFAULT_L2_NC_D           1000

#define BLIS_DEFAULT_L2_MC_C           1000
#define BLIS_DEFAULT_L2_NC_C           1000

#define BLIS_DEFAULT_L2_MC_Z           1000
#define BLIS_DEFAULT_L2_NC_Z           1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------

// -- Default fusing factors for level-1f operations --
//
// NOTE: The reference implementations of the level-1f operations do not
// use these default fusing factors. They exist only for use once these
// operations are optimized.

#define BLIS_DEFAULT_FUSE_FAC_S        8
#define BLIS_DEFAULT_FUSE_FAC_D        4
#define BLIS_DEFAULT_FUSE_FAC_C        4
#define BLIS_DEFAULT_FUSE_FAC_Z        2

// Per-operation fusing factors. Every one of them expands to the
// per-datatype default defined immediately above.

#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z

#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z

#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------

// -- Default register blocksizes for vectors --
//
// NOTE: The vector register blocksizes come into play when packing
// non-contiguous vectors. As with KR, a value of 1 is typically
// sufficient.

#define BLIS_DEFAULT_VR_S              1
#define BLIS_DEFAULT_VR_D              1
#define BLIS_DEFAULT_VR_C              1
#define BLIS_DEFAULT_VR_Z              1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
//
// Each macro below expands to the name selected for that micro-kernel
// slot. Every slot here is given a "_ref_mxn" name.

// -- gemm --

#define GEMM_UKERNEL                   gemm_ref_mxn

// -- trsm-related --

#define GEMMTRSM_L_UKERNEL             gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL             gemmtrsm_u_ref_mxn

#define TRSM_L_UKERNEL                 trsm_l_ref_mxn
#define TRSM_U_UKERNEL                 trsm_u_ref_mxn
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
//
// One macro is provided per panel width, covering the even widths 2
// through 16, for each of packm and unpackm. Every slot is given a
// "_ref_" name.

// -- packm --

#define PACKM_2XK_KERNEL               packm_ref_2xk
#define PACKM_4XK_KERNEL               packm_ref_4xk
#define PACKM_6XK_KERNEL               packm_ref_6xk
#define PACKM_8XK_KERNEL               packm_ref_8xk
#define PACKM_10XK_KERNEL              packm_ref_10xk
#define PACKM_12XK_KERNEL              packm_ref_12xk
#define PACKM_14XK_KERNEL              packm_ref_14xk
#define PACKM_16XK_KERNEL              packm_ref_16xk

// -- unpackm --

#define UNPACKM_2XK_KERNEL             unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL             unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL             unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL             unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL            unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL            unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL            unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL            unpackm_ref_16xk
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
//
// Each level-1f kernel slot below is given the "_unb_var1" name of the
// same operation.

// -- axpy2v --

#define AXPY2V_KERNEL                  axpy2v_unb_var1

// -- dotaxpyv --

#define DOTAXPYV_KERNEL                dotaxpyv_unb_var1

// -- axpyf --

#define AXPYF_KERNEL                   axpyf_unb_var1

// -- dotxf --

#define DOTXF_KERNEL                   dotxf_unb_var1

// -- dotxaxpyf --

#define DOTXAXPYF_KERNEL               dotxaxpyf_unb_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
//
// Each level-1v kernel slot below is given the "_unb_var1" name of the
// same operation.

// -- addv --

#define ADDV_KERNEL                    addv_unb_var1

// -- axpyv --

#define AXPYV_KERNEL                   axpyv_unb_var1

// -- copyv --

#define COPYV_KERNEL                   copyv_unb_var1

// -- dotv --

#define DOTV_KERNEL                    dotv_unb_var1

// -- dotxv --

#define DOTXV_KERNEL                   dotxv_unb_var1

// -- invertv --

#define INVERTV_KERNEL                 invertv_unb_var1

// -- scal2v --

#define SCAL2V_KERNEL                  scal2v_unb_var1

// -- scalv --

#define SCALV_KERNEL                   scalv_unb_var1

// -- setv --

#define SETV_KERNEL                    setv_unb_var1

// -- subv --

#define SUBV_KERNEL                    subv_unb_var1

// -- swapv --

#define SWAPV_KERNEL                   swapv_unb_var1
// In the reference configuration, we let all of the defaults take
// effect. Thus, no definitions are needed.
#endif

View File

@@ -38,7 +38,7 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
@@ -70,35 +70,7 @@
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +88,34 @@
// of f, handles leftover edge cases (i.e., when k % f > 0), then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,111 +123,39 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
#include "bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h"
// -- gemm --
#define GEMM_UKERNEL gemm_opt_8x4_ref_u4_nodupl_avx1
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4_ref_u4_nodupl_avx1
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
@@ -239,25 +163,8 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
@@ -265,23 +172,14 @@
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
@@ -289,48 +187,26 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -38,9 +38,8 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
// -- Cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
@@ -52,53 +51,24 @@
// (3) KC must be a multiple of
// (a) MR and
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 64
#define BLIS_DEFAULT_KC_S 128
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_S 128
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 2048
#define BLIS_DEFAULT_MC_D 64
#define BLIS_DEFAULT_KC_D 128
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_D 128
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 2048
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_C 128
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 2048
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
#define BLIS_DEFAULT_MC_Z 128
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
@@ -116,10 +86,34 @@
// of f, handles leftover edge cases (i.e., when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
//#define BLIS_EXTEND_MC_S 0
//#define BLIS_EXTEND_KC_S 0
//#define BLIS_EXTEND_NC_S 0
//#define BLIS_EXTEND_MC_D 0
//#define BLIS_EXTEND_KC_D 0
//#define BLIS_EXTEND_NC_D 0
//#define BLIS_EXTEND_MC_C 0
//#define BLIS_EXTEND_KC_C 0
//#define BLIS_EXTEND_NC_C 0
//#define BLIS_EXTEND_MC_Z 0
//#define BLIS_EXTEND_KC_Z 0
//#define BLIS_EXTEND_NC_Z 0
// -- Register blocksize extensions (for packed micro-panels) --
@@ -127,24 +121,52 @@
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
//#define BLIS_EXTEND_MR_S 0
//#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
//#define BLIS_EXTEND_MR_D 0
//#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
//#define BLIS_EXTEND_MR_C 0
//#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
//#define BLIS_EXTEND_MR_Z 0
//#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- LEVEL-3 MICRO-KERNELS ---------------------------------------------------
// Each macro in this section maps a per-datatype micro-kernel name to the
// function that implements it. The leading S/D/C/Z of the macro name matches
// the s/d/c/z character in the bli_?..._opt_mxn function it expands to.
// NOTE(review): the commit description says a kernel macro left undefined in
// bli_kernel.h defaults to its *_REF counterpart; confirm this header is a
// bli_kernel.h before relying on that here.
// -- gemm --
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_mxn
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_mxn
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_mxn
#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_mxn
// -- trsm-related --
// Fused gemmtrsm micro-kernels, _L and _U variants.
#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_opt_mxn
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_opt_mxn
#define BLIS_CGEMMTRSM_L_UKERNEL bli_cgemmtrsm_l_opt_mxn
#define BLIS_ZGEMMTRSM_L_UKERNEL bli_zgemmtrsm_l_opt_mxn
#define BLIS_SGEMMTRSM_U_UKERNEL bli_sgemmtrsm_u_opt_mxn
#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_opt_mxn
#define BLIS_CGEMMTRSM_U_UKERNEL bli_cgemmtrsm_u_opt_mxn
#define BLIS_ZGEMMTRSM_U_UKERNEL bli_zgemmtrsm_u_opt_mxn
// Stand-alone trsm micro-kernels, _L and _U variants.
#define BLIS_STRSM_L_UKERNEL bli_strsm_l_opt_mxn
#define BLIS_DTRSM_L_UKERNEL bli_dtrsm_l_opt_mxn
#define BLIS_CTRSM_L_UKERNEL bli_ctrsm_l_opt_mxn
#define BLIS_ZTRSM_L_UKERNEL bli_ztrsm_l_opt_mxn
#define BLIS_STRSM_U_UKERNEL bli_strsm_u_opt_mxn
#define BLIS_DTRSM_U_UKERNEL bli_dtrsm_u_opt_mxn
#define BLIS_CTRSM_U_UKERNEL bli_ctrsm_u_opt_mxn
#define BLIS_ZTRSM_U_UKERNEL bli_ztrsm_u_opt_mxn
@@ -158,17 +180,18 @@
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
//#define BLIS_DEFAULT_L2_MC_S 1000
//#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
//#define BLIS_DEFAULT_L2_MC_D 1000
//#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
//#define BLIS_DEFAULT_L2_MC_C 1000
//#define BLIS_DEFAULT_L2_NC_C 1000
//#define BLIS_DEFAULT_L2_MC_Z 1000
//#define BLIS_DEFAULT_L2_NC_Z 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
@@ -176,66 +199,67 @@
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
//#define BLIS_L1F_FUSE_FAC_S 8
//#define BLIS_L1F_FUSE_FAC_D 4
//#define BLIS_L1F_FUSE_FAC_C 4
//#define BLIS_L1F_FUSE_FAC_Z 2
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
//#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
//#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
//#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
//#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
//#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
//#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
//#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
//#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
//#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
//#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
//#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
//#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. As with KR, they can
// typically be set to 1.
// -- axpy2v --
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
#define BLIS_SAXPY2V_KERNEL bli_saxpy2v_opt_var1
#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_opt_var1
#define BLIS_CAXPY2V_KERNEL bli_caxpy2v_opt_var1
#define BLIS_ZAXPY2V_KERNEL bli_zaxpy2v_opt_var1
// -- dotaxpyv --
#define BLIS_SDOTAXPYV_KERNEL bli_sdotaxpyv_opt_var1
#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_opt_var1
#define BLIS_CDOTAXPYV_KERNEL bli_cdotaxpyv_opt_var1
#define BLIS_ZDOTAXPYV_KERNEL bli_zdotaxpyv_opt_var1
// -- axpyf --
#define BLIS_SAXPYF_KERNEL bli_saxpyf_opt_var1
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
#define BLIS_CAXPYF_KERNEL bli_caxpyf_opt_var1
#define BLIS_ZAXPYF_KERNEL bli_zaxpyf_opt_var1
// -- dotxf --
#define BLIS_SDOTXF_KERNEL bli_sdotxf_opt_var1
#define BLIS_DDOTXF_KERNEL bli_ddotxf_opt_var1
#define BLIS_CDOTXF_KERNEL bli_cdotxf_opt_var1
#define BLIS_ZDOTXF_KERNEL bli_zdotxf_opt_var1
// -- dotxaxpyf --
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
#define BLIS_SDOTXAXPYF_KERNEL bli_sdotxaxpyf_opt_var1
#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_opt_var1
#define BLIS_CDOTXAXPYF_KERNEL bli_cdotxaxpyf_opt_var1
#define BLIS_ZDOTXAXPYF_KERNEL bli_zdotxaxpyf_opt_var1
#include "bli_gemm_opt_mxn.h"
#include "bli_trsm_l_opt_mxn.h"
#include "bli_trsm_u_opt_mxn.h"
#include "bli_gemmtrsm_l_opt_mxn.h"
#include "bli_gemmtrsm_u_opt_mxn.h"
// -- gemm --
#define GEMM_UKERNEL gemm_opt_mxn
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_mxn
#define TRSM_L_UKERNEL trsm_l_opt_mxn
#define TRSM_U_UKERNEL trsm_u_opt_mxn
@@ -243,55 +267,30 @@
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
//#define BLIS_SPACKM_2XK_KERNEL bli_spackm_ref_2xk
//#define BLIS_DPACKM_2XK_KERNEL bli_dpackm_ref_2xk
//#define BLIS_CPACKM_2XK_KERNEL bli_cpackm_ref_2xk
//#define BLIS_ZPACKM_2XK_KERNEL bli_zpackm_ref_2xk
// -- unpackm --
//#define BLIS_SPACKM_4XK_KERNEL bli_spackm_ref_4xk
//#define BLIS_DPACKM_4XK_KERNEL bli_dpackm_ref_4xk
//#define BLIS_CPACKM_4XK_KERNEL bli_cpackm_ref_4xk
//#define BLIS_ZPACKM_4XK_KERNEL bli_zpackm_ref_4xk
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
//#define BLIS_SPACKM_6XK_KERNEL bli_spackm_ref_6xk
//#define BLIS_DPACKM_6XK_KERNEL bli_dpackm_ref_6xk
//#define BLIS_CPACKM_6XK_KERNEL bli_cpackm_ref_6xk
//#define BLIS_ZPACKM_6XK_KERNEL bli_zpackm_ref_6xk
//#define BLIS_SPACKM_8XK_KERNEL bli_spackm_ref_8xk
//#define BLIS_DPACKM_8XK_KERNEL bli_dpackm_ref_8xk
//#define BLIS_CPACKM_8XK_KERNEL bli_cpackm_ref_8xk
//#define BLIS_ZPACKM_8XK_KERNEL bli_zpackm_ref_8xk
// ...
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// (Commented definitions for 10, 12, 14, and 16 not shown).
#include "bli_axpy2v_opt_var1.h"
#include "bli_dotaxpyv_opt_var1.h"
#include "bli_axpyf_opt_var1.h"
#include "bli_dotxf_opt_var1.h"
#include "bli_dotxaxpyf_opt_var1.h"
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_opt_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_opt_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_opt_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_opt_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1
@@ -299,47 +298,81 @@
// -- addv --
#define ADDV_KERNEL addv_unb_var1
//#define BLIS_SADDV_KERNEL bli_saddv_unb_var1
//#define BLIS_DADDV_KERNEL bli_daddv_unb_var1
//#define BLIS_CADDV_KERNEL bli_caddv_unb_var1
//#define BLIS_ZADDV_KERNEL bli_zaddv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt_var1
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
#define BLIS_CAXPYV_KERNEL bli_caxpyv_opt_var1
#define BLIS_ZAXPYV_KERNEL bli_zaxpyv_opt_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
//#define BLIS_SCOPYV_KERNEL bli_scopyv_unb_var1
//#define BLIS_DCOPYV_KERNEL bli_dcopyv_unb_var1
//#define BLIS_CCOPYV_KERNEL bli_ccopyv_unb_var1
//#define BLIS_ZCOPYV_KERNEL bli_zcopyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
#define BLIS_SDOTV_KERNEL bli_sdotv_opt_var1
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
#define BLIS_CDOTV_KERNEL bli_cdotv_opt_var1
#define BLIS_ZDOTV_KERNEL bli_zdotv_opt_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
//#define BLIS_SDOTXV_KERNEL bli_sdotxv_unb_var1
//#define BLIS_DDOTXV_KERNEL bli_ddotxv_unb_var1
//#define BLIS_CDOTXV_KERNEL bli_cdotxv_unb_var1
//#define BLIS_ZDOTXV_KERNEL bli_zdotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
//#define BLIS_SINVERTV_KERNEL bli_sinvertv_unb_var1
//#define BLIS_DINVERTV_KERNEL bli_dinvertv_unb_var1
//#define BLIS_CINVERTV_KERNEL bli_cinvertv_unb_var1
//#define BLIS_ZINVERTV_KERNEL bli_zinvertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
//#define BLIS_SSCAL2V_KERNEL bli_sscal2v_unb_var1
//#define BLIS_DSCAL2V_KERNEL bli_dscal2v_unb_var1
//#define BLIS_CSCAL2V_KERNEL bli_cscal2v_unb_var1
//#define BLIS_ZSCAL2V_KERNEL bli_zscal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
//#define BLIS_SSCALV_KERNEL bli_sscalv_unb_var1
//#define BLIS_DSCALV_KERNEL bli_dscalv_unb_var1
//#define BLIS_CSCALV_KERNEL bli_cscalv_unb_var1
//#define BLIS_ZSCALV_KERNEL bli_zscalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
//#define BLIS_SSETV_KERNEL bli_ssetv_unb_var1
//#define BLIS_DSETV_KERNEL bli_dsetv_unb_var1
//#define BLIS_CSETV_KERNEL bli_csetv_unb_var1
//#define BLIS_ZSETV_KERNEL bli_zsetv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
//#define BLIS_SSUBV_KERNEL bli_ssubv_unb_var1
//#define BLIS_DSUBV_KERNEL bli_dsubv_unb_var1
//#define BLIS_CSUBV_KERNEL bli_csubv_unb_var1
//#define BLIS_ZSUBV_KERNEL bli_zsubv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
//#define BLIS_SSWAPV_KERNEL bli_sswapv_unb_var1
//#define BLIS_DSWAPV_KERNEL bli_dswapv_unb_var1
//#define BLIS_CSWAPV_KERNEL bli_cswapv_unb_var1
//#define BLIS_ZSWAPV_KERNEL bli_zswapv_unb_var1

View File

@@ -36,59 +36,59 @@
// axpyv kernel entry point for `float` operands (alpha, x and y).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_sss*) and the one-character spelling (bli_s* / BLIS_S*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_sssaxpyv_opt_var1( conj_t conjx,
dim_t n,
float* restrict alpha,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy )
void bli_saxpyv_opt_var1( conj_t conjx,
dim_t n,
float* restrict alpha,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy )
{
/* Just call the reference implementation. */
bli_sssaxpyv_unb_var1( conjx,
n,
alpha,
x, incx,
y, incy );
BLIS_SAXPYV_KERNEL_REF( conjx,
n,
alpha,
x, incx,
y, incy );
}
// axpyv kernel entry point for `double` operands (alpha, x and y).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_ddd*) and the one-character spelling (bli_d* / BLIS_D*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_dddaxpyv_opt_var1( conj_t conjx,
dim_t n,
double* restrict alpha,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy )
void bli_daxpyv_opt_var1( conj_t conjx,
dim_t n,
double* restrict alpha,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy )
{
/* Just call the reference implementation. */
bli_dddaxpyv_unb_var1( conjx,
n,
alpha,
x, incx,
y, incy );
BLIS_DAXPYV_KERNEL_REF( conjx,
n,
alpha,
x, incx,
y, incy );
}
// axpyv kernel entry point for `scomplex` operands (alpha, x and y).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_ccc*) and the one-character spelling (bli_c* / BLIS_C*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_cccaxpyv_opt_var1( conj_t conjx,
dim_t n,
scomplex* restrict alpha,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy )
void bli_caxpyv_opt_var1( conj_t conjx,
dim_t n,
scomplex* restrict alpha,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy )
{
/* Just call the reference implementation. */
bli_cccaxpyv_unb_var1( conjx,
n,
alpha,
x, incx,
y, incy );
BLIS_CAXPYV_KERNEL_REF( conjx,
n,
alpha,
x, incx,
y, incy );
}
void bli_zzzaxpyv_opt_var1( conj_t conjx,
dim_t n,
dcomplex* restrict alpha,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy )
void bli_zaxpyv_opt_var1( conj_t conjx,
dim_t n,
dcomplex* restrict alpha,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy )
{
/*
Template axpyv kernel implementation
@@ -193,11 +193,11 @@ void bli_zzzaxpyv_opt_var1( conj_t conjx,
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_zzzaxpyv_unb_var1( conjx,
n,
alpha,
x, incx,
y, incy );
BLIS_ZAXPYV_KERNEL_REF( conjx,
n,
alpha,
x, incx,
y, incy );
return;
}
@@ -272,37 +272,3 @@ void bli_zzzaxpyv_opt_var1( conj_t conjx,
}
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3 generates one axpyv wrapper per (cha, chx, chy) type-character
// triple. The wrapper is named PASTEMAC3(cha,chx,chy,opname), takes alpha, x
// and y with the three corresponding C types, and does nothing except forward
// its arguments to PASTEMAC3(cha,chx,chy,varname).
// The instantiations at the bottom are compiled only when mixed-domain and/or
// mixed-precision support is enabled; both pass axpyv_opt_var1 as opname and
// axpyv_unb_var1 as varname.
// NOTE(review): the INSERT_GENTFUNC3_MIX_* macros are defined elsewhere;
// presumably they expand GENTFUNC3 once per mixed-type combination.
#undef GENTFUNC3
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \
\
void PASTEMAC3(cha,chx,chy,opname)( \
conj_t conjx, \
dim_t n, \
ctype_a* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
) \
{ \
/* Just call the reference implementation. */ \
PASTEMAC3(cha,chx,chy,varname)( conjx, \
n, \
alpha, \
x, incx, \
y, incy ); \
}
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( axpyv_opt_var1, axpyv_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( axpyv_opt_var1, axpyv_unb_var1 )
#endif

View File

@@ -36,66 +36,66 @@
// dotv kernel entry point for `float` operands (x, y and the result rho).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_sss*) and the one-character spelling (bli_s* / BLIS_S*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_sssdotv_opt_var1( conj_t conjx,
conj_t conjy,
dim_t n,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy,
float* restrict rho )
void bli_sdotv_opt_var1( conj_t conjx,
conj_t conjy,
dim_t n,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy,
float* restrict rho )
{
/* Just call the reference implementation. */
bli_sssdotv_unb_var1( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
BLIS_SDOTV_KERNEL_REF( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
}
// dotv kernel entry point for `double` operands (x, y and the result rho).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_ddd*) and the one-character spelling (bli_d* / BLIS_D*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_ddddotv_opt_var1( conj_t conjx,
conj_t conjy,
dim_t n,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
double* restrict rho )
void bli_ddotv_opt_var1( conj_t conjx,
conj_t conjy,
dim_t n,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
double* restrict rho )
{
/* Just call the reference implementation. */
bli_ddddotv_unb_var1( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
BLIS_DDOTV_KERNEL_REF( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
}
// dotv kernel entry point for `scomplex` operands (x, y and the result rho).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_ccc*) and the one-character spelling (bli_c* / BLIS_C*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_cccdotv_opt_var1( conj_t conjx,
conj_t conjy,
dim_t n,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy,
scomplex* restrict rho )
void bli_cdotv_opt_var1( conj_t conjx,
conj_t conjy,
dim_t n,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy,
scomplex* restrict rho )
{
/* Just call the reference implementation. */
bli_cccdotv_unb_var1( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
BLIS_CDOTV_KERNEL_REF( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
}
void bli_zzzdotv_opt_var1( conj_t conjx,
conj_t conjy,
dim_t n,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict rho )
void bli_zdotv_opt_var1( conj_t conjx,
conj_t conjy,
dim_t n,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict rho )
{
/*
Template dotv kernel implementation
@@ -210,12 +210,12 @@ void bli_zzzdotv_opt_var1( conj_t conjx,
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_zzzdotv_unb_var1( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
BLIS_ZDOTV_KERNEL_REF( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
return;
}
@@ -310,36 +310,3 @@ void bli_zzzdotv_opt_var1( conj_t conjx,
bli_zzcopys( dotxy, *rho );
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3 generates one dotv wrapper per (chx, chy, chr) type-character
// triple. The wrapper is named PASTEMAC3(chx,chy,chr,opname), takes x, y and
// rho with the three corresponding C types, and does nothing except forward
// its arguments to PASTEMAC3(chx,chy,chr,varname).
// The instantiations at the bottom are compiled only when mixed-domain and/or
// mixed-precision support is enabled; both pass dotv_opt_var1 as opname and
// dotv_unb_var1 as varname.
// NOTE(review): the INSERT_GENTFUNC3_MIX_* macros are defined elsewhere;
// presumably they expand GENTFUNC3 once per mixed-type combination.
#undef GENTFUNC3
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
\
void PASTEMAC3(chx,chy,chr,opname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_r* restrict rho \
) \
{ \
/* Just call the reference implementation. */ \
PASTEMAC3(chx,chy,chr,varname)( conjx, \
conjy, \
n, \
x, incx, \
y, incy, \
rho ); \
}
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( dotv_opt_var1, dotv_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( dotv_opt_var1, dotv_unb_var1 )
#endif

View File

@@ -1,59 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype dotv kernel interfaces.
//
// GENTPROT3 declares (does not define) one function per (chx, chy, chr)
// type-character triple, named PASTEMAC3(chx,chy,chr,varname), with x, y and
// rho typed by the three corresponding C types.
// The BASIC instantiation for dotv_opt_var1 is unconditional; the MIX_D and
// MIX_P instantiations are emitted only when mixed-domain or mixed-precision
// support is enabled.
// NOTE(review): the INSERT_GENTPROT3_* macros are defined elsewhere;
// presumably each expands GENTPROT3 over its set of type combinations.
#undef GENTPROT3
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_r* restrict rho \
);
INSERT_GENTPROT3_BASIC( dotv_opt_var1 )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3_MIX_D( dotv_opt_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3_MIX_P( dotv_opt_var1 )
#endif

View File

@@ -36,88 +36,88 @@
// axpy2v kernel entry point for `float` operands (alpha1, alpha2, x, y, z).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_sss*) and the one-character spelling (bli_s* / BLIS_S*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_sssaxpy2v_opt_var1(
conj_t conjx,
conj_t conjy,
dim_t n,
float* restrict alpha1,
float* restrict alpha2,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy,
float* restrict z, inc_t incz
)
void bli_saxpy2v_opt_var1(
conj_t conjx,
conj_t conjy,
dim_t n,
float* restrict alpha1,
float* restrict alpha2,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy,
float* restrict z, inc_t incz
)
{
/* Just call the reference implementation. */
bli_sssaxpy2v_unb_var1( conjx,
conjy,
n,
alpha1,
alpha2,
x, incx,
y, incy,
z, incz );
BLIS_SAXPY2V_KERNEL_REF( conjx,
conjy,
n,
alpha1,
alpha2,
x, incx,
y, incy,
z, incz );
}
// axpy2v kernel entry point for `double` operands (alpha1, alpha2, x, y, z).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_ddd*) and the one-character spelling (bli_d* / BLIS_D*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_dddaxpy2v_opt_var1(
conj_t conjx,
conj_t conjy,
dim_t n,
double* restrict alpha1,
double* restrict alpha2,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
double* restrict z, inc_t incz
)
void bli_daxpy2v_opt_var1(
conj_t conjx,
conj_t conjy,
dim_t n,
double* restrict alpha1,
double* restrict alpha2,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
double* restrict z, inc_t incz
)
{
/* Just call the reference implementation. */
bli_dddaxpy2v_unb_var1( conjx,
conjy,
n,
alpha1,
alpha2,
x, incx,
y, incy,
z, incz );
BLIS_DAXPY2V_KERNEL_REF( conjx,
conjy,
n,
alpha1,
alpha2,
x, incx,
y, incy,
z, incz );
}
// axpy2v kernel entry point for `scomplex` operands (alpha1, alpha2, x, y, z).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_ccc*) and the one-character spelling (bli_c* / BLIS_C*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_cccaxpy2v_opt_var1(
conj_t conjx,
conj_t conjy,
dim_t n,
scomplex* restrict alpha1,
scomplex* restrict alpha2,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy,
scomplex* restrict z, inc_t incz
)
void bli_caxpy2v_opt_var1(
conj_t conjx,
conj_t conjy,
dim_t n,
scomplex* restrict alpha1,
scomplex* restrict alpha2,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy,
scomplex* restrict z, inc_t incz
)
{
/* Just call the reference implementation. */
bli_cccaxpy2v_unb_var1( conjx,
conjy,
n,
alpha1,
alpha2,
x, incx,
y, incy,
z, incz );
BLIS_CAXPY2V_KERNEL_REF( conjx,
conjy,
n,
alpha1,
alpha2,
x, incx,
y, incy,
z, incz );
}
void bli_zzzaxpy2v_opt_var1(
conj_t conjx,
conj_t conjy,
dim_t n,
dcomplex* restrict alpha1,
dcomplex* restrict alpha2,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict z, inc_t incz
)
void bli_zaxpy2v_opt_var1(
conj_t conjx,
conj_t conjy,
dim_t n,
dcomplex* restrict alpha1,
dcomplex* restrict alpha2,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict z, inc_t incz
)
{
/*
Template axpy2v kernel implementation
@@ -229,14 +229,14 @@ void bli_zzzaxpy2v_opt_var1(
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_zzzaxpy2v_unb_var1( conjx,
conjy,
n,
alpha1,
alpha2,
x, incx,
y, incy,
z, incz );
BLIS_ZAXPY2V_KERNEL_REF( conjx,
conjy,
n,
alpha1,
alpha2,
x, incx,
y, incy,
z, incz );
return;
}
@@ -396,41 +396,3 @@ void bli_zzzaxpy2v_opt_var1(
}
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3U12 generates one axpy2v wrapper per (chx, chy, chz)
// type-character triple. The wrapper is named PASTEMAC3(chx,chy,chz,varname);
// x, y and z use the three corresponding C types, while both alpha1 and
// alpha2 use the fourth type, ctype_xy. The body does nothing except forward
// its arguments to PASTEMAC3(chx,chy,chz,kername).
// The chxy parameter is accepted but not referenced in the macro body.
// The instantiations at the bottom are compiled only when mixed-domain and/or
// mixed-precision support is enabled; both pass axpy2v_opt_var1 as varname
// and axpy2v_unb_var1 as kername.
// NOTE(review): the INSERT_GENTFUNC3U12_MIX_* macros are defined elsewhere;
// presumably they expand GENTFUNC3U12 once per mixed-type combination.
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chz,varname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_xy* restrict alpha1, \
ctype_xy* restrict alpha2, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_z* restrict z, inc_t incz \
) \
{ \
/* Just call the reference implementation. */ \
PASTEMAC3(chx,chy,chz,kername)( conjx, \
conjy, \
n, \
alpha1, \
alpha2, \
x, incx, \
y, incy, \
z, incz ); \
}
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( axpy2v_opt_var1, axpy2v_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( axpy2v_opt_var1, axpy2v_unb_var1 )
#endif

View File

@@ -36,87 +36,87 @@
// axpyf kernel entry point for `float` operands (alpha, a, x and y).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_sss*) and the one-character spelling (bli_s* / BLIS_S*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_sssaxpyf_opt_var1(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
float* restrict alpha,
float* restrict a, inc_t inca, inc_t lda,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy
)
void bli_saxpyf_opt_var1(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
float* restrict alpha,
float* restrict a, inc_t inca, inc_t lda,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy
)
{
/* Just call the reference implementation. */
bli_sssaxpyf_unb_var1( conja,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
y, incy );
BLIS_SAXPYF_KERNEL_REF( conja,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
y, incy );
}
// axpyf kernel entry point for `double` operands (alpha, a, x and y).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_ddd*) and the one-character spelling (bli_d* / BLIS_D*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_dddaxpyf_opt_var1(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
double* restrict alpha,
double* restrict a, inc_t inca, inc_t lda,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy
)
void bli_daxpyf_opt_var1(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
double* restrict alpha,
double* restrict a, inc_t inca, inc_t lda,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy
)
{
/* Just call the reference implementation. */
bli_dddaxpyf_unb_var1( conja,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
y, incy );
BLIS_DAXPYF_KERNEL_REF( conja,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
y, incy );
}
// axpyf kernel entry point for `scomplex` operands (alpha, a, x and y).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_ccc*) and the one-character spelling (bli_c* / BLIS_C*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_cccaxpyf_opt_var1(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
scomplex* restrict alpha,
scomplex* restrict a, inc_t inca, inc_t lda,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy
)
void bli_caxpyf_opt_var1(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
scomplex* restrict alpha,
scomplex* restrict a, inc_t inca, inc_t lda,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy
)
{
/* Just call the reference implementation. */
bli_cccaxpyf_unb_var1( conja,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
y, incy );
BLIS_CAXPYF_KERNEL_REF( conja,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
y, incy );
}
void bli_zzzaxpyf_opt_var1(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
dcomplex* restrict alpha,
dcomplex* restrict a, inc_t inca, inc_t lda,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy
)
void bli_zaxpyf_opt_var1(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
dcomplex* restrict alpha,
dcomplex* restrict a, inc_t inca, inc_t lda,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy
)
{
/*
Template axpyf kernel implementation
@@ -243,14 +243,14 @@ void bli_zzzaxpyf_opt_var1(
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_zzzaxpyf_unb_var1( conja,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
y, incy );
BLIS_ZAXPYF_KERNEL_REF( conja,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
y, incy );
return;
}
@@ -376,41 +376,3 @@ void bli_zzzaxpyf_opt_var1(
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3U12 generates one axpyf wrapper per (cha, chx, chy)
// type-character triple. The wrapper is named PASTEMAC3(cha,chx,chy,varname);
// a, x and y use the three corresponding C types, while alpha uses the fourth
// type, ctype_ax. The body does nothing except forward its arguments to
// PASTEMAC3(cha,chx,chy,kername).
// The chax parameter is accepted but not referenced in the macro body.
// The instantiations at the bottom are compiled only when mixed-domain and/or
// mixed-precision support is enabled; both pass axpyf_opt_var1 as varname and
// axpyf_unb_var1 as kername.
// NOTE(review): the INSERT_GENTFUNC3U12_MIX_* macros are defined elsewhere;
// presumably they expand GENTFUNC3U12 once per mixed-type combination.
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype_ax* restrict alpha, \
ctype_a* restrict a, inc_t inca, inc_t lda, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
) \
{ \
/* Just call the reference implementation. */ \
PASTEMAC3(cha,chx,chy,kername)( conja, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
y, incy ); \
}
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( axpyf_opt_var1, axpyf_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( axpyf_opt_var1, axpyf_unb_var1 )
#endif

View File

@@ -36,87 +36,87 @@
// dotaxpyv kernel entry point for `float` operands (alpha, x, y, rho and z).
// It contains no optimized code path: every argument is forwarded, unchanged
// and in the same order, to the reference kernel.
// NOTE(review): this listing carries both the three-character spelling
// (bli_sss*) and the one-character spelling (bli_s* / BLIS_S*_KERNEL_REF) of
// the signature and the forwarded call; the commit description names the
// one-character form as the current convention.
void bli_sssdotaxpyv_opt_var1( conj_t conjxt,
conj_t conjx,
conj_t conjy,
dim_t n,
float* restrict alpha,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy,
float* restrict rho,
float* restrict z, inc_t incz )
void bli_sdotaxpyv_opt_var1( conj_t conjxt,
conj_t conjx,
conj_t conjy,
dim_t n,
float* restrict alpha,
float* restrict x, inc_t incx,
float* restrict y, inc_t incy,
float* restrict rho,
float* restrict z, inc_t incz )
{
/* Just call the reference implementation. */
bli_sssdotaxpyv_unb_var1( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
BLIS_SDOTAXPYV_KERNEL_REF( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
}
void bli_ddddotaxpyv_opt_var1( conj_t conjxt,
conj_t conjx,
conj_t conjy,
dim_t n,
double* restrict alpha,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
double* restrict rho,
double* restrict z, inc_t incz )
void bli_ddotaxpyv_opt_var1( conj_t conjxt,
conj_t conjx,
conj_t conjy,
dim_t n,
double* restrict alpha,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
double* restrict rho,
double* restrict z, inc_t incz )
{
/* Just call the reference implementation. */
bli_ddddotaxpyv_unb_var1( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
BLIS_DDOTAXPYV_KERNEL_REF( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
}
void bli_cccdotaxpyv_opt_var1( conj_t conjxt,
conj_t conjx,
conj_t conjy,
dim_t n,
scomplex* restrict alpha,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy,
scomplex* restrict rho,
scomplex* restrict z, inc_t incz )
void bli_cdotaxpyv_opt_var1( conj_t conjxt,
conj_t conjx,
conj_t conjy,
dim_t n,
scomplex* restrict alpha,
scomplex* restrict x, inc_t incx,
scomplex* restrict y, inc_t incy,
scomplex* restrict rho,
scomplex* restrict z, inc_t incz )
{
/* Just call the reference implementation. */
bli_cccdotaxpyv_unb_var1( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
BLIS_CDOTAXPYV_KERNEL_REF( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
}
void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
conj_t conjx,
conj_t conjy,
dim_t n,
dcomplex* restrict alpha,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict rho,
dcomplex* restrict z, inc_t incz )
void bli_zdotaxpyv_opt_var1( conj_t conjxt,
conj_t conjx,
conj_t conjy,
dim_t n,
dcomplex* restrict alpha,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict rho,
dcomplex* restrict z, inc_t incz )
{
/*
Template dotaxpyv kernel implementation
@@ -240,15 +240,15 @@ void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_zzzdotaxpyv_unb_var1( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
BLIS_ZDOTAXPYV_KERNEL_REF( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
return;
}
@@ -429,42 +429,3 @@ void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
bli_zzcopys( dotxy, *rho );
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3U12 generates one heterogeneous-typed dotaxpyv wrapper per
// instantiation. The wrapper is named by pasting chx/chy/chz onto varname
// and forwards every argument, unchanged, to the function named by pasting
// the same three characters onto kername.
//   ctype_x  : element type of alpha and x
//   ctype_y  : element type of y
//   ctype_z  : element type of z
//   ctype_xy : element type of rho
//   chxy     : accepted but not referenced in the body
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chz,varname)( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_x* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_xy* restrict rho, \
ctype_z* restrict z, inc_t incz \
) \
{ \
/* Just call the reference implementation. */ \
PASTEMAC3(chx,chy,chz,kername)( conjxt, \
conjx, \
conjy, \
n, \
alpha, \
x, incx, \
y, incy, \
rho, \
z, incz ); \
}
// The wrappers are only instantiated when the corresponding mixed-datatype
// support is enabled; each one pairs dotaxpyv_opt_var1 (the generated name)
// with dotaxpyv_unb_var1 (the forwarded-to name).
// NOTE(review): the INSERT_GENTFUNC3U12_MIX_* macros are defined elsewhere;
// presumably they expand GENTFUNC3U12 once per mixed type combination.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
#endif

View File

@@ -36,115 +36,115 @@
void bli_sssdotxaxpyf_opt_var1( conj_t conjat,
conj_t conja,
conj_t conjw,
conj_t conjx,
dim_t m,
dim_t b_n,
float* restrict alpha,
float* restrict a, inc_t inca, inc_t lda,
float* restrict w, inc_t incw,
float* restrict x, inc_t incx,
float* restrict beta,
float* restrict y, inc_t incy,
float* restrict z, inc_t incz )
void bli_sdotxaxpyf_opt_var1( conj_t conjat,
conj_t conja,
conj_t conjw,
conj_t conjx,
dim_t m,
dim_t b_n,
float* restrict alpha,
float* restrict a, inc_t inca, inc_t lda,
float* restrict w, inc_t incw,
float* restrict x, inc_t incx,
float* restrict beta,
float* restrict y, inc_t incy,
float* restrict z, inc_t incz )
{
/* Just call the reference implementation. */
bli_sssdotxaxpyf_unb_var1( conjat,
conja,
conjw,
conjx,
m,
b_n,
alpha,
a, inca, lda,
w, incw,
x, incx,
beta,
y, incy,
z, incz );
BLIS_SDOTXAXPYF_KERNEL_REF( conjat,
conja,
conjw,
conjx,
m,
b_n,
alpha,
a, inca, lda,
w, incw,
x, incx,
beta,
y, incy,
z, incz );
}
void bli_ddddotxaxpyf_opt_var1( conj_t conjat,
conj_t conja,
conj_t conjw,
conj_t conjx,
dim_t m,
dim_t b_n,
double* restrict alpha,
double* restrict a, inc_t inca, inc_t lda,
double* restrict w, inc_t incw,
double* restrict x, inc_t incx,
double* restrict beta,
double* restrict y, inc_t incy,
double* restrict z, inc_t incz )
void bli_ddotxaxpyf_opt_var1( conj_t conjat,
conj_t conja,
conj_t conjw,
conj_t conjx,
dim_t m,
dim_t b_n,
double* restrict alpha,
double* restrict a, inc_t inca, inc_t lda,
double* restrict w, inc_t incw,
double* restrict x, inc_t incx,
double* restrict beta,
double* restrict y, inc_t incy,
double* restrict z, inc_t incz )
{
/* Just call the reference implementation. */
bli_ddddotxaxpyf_unb_var1( conjat,
conja,
conjw,
conjx,
m,
b_n,
alpha,
a, inca, lda,
w, incw,
x, incx,
beta,
y, incy,
z, incz );
BLIS_DDOTXAXPYF_KERNEL_REF( conjat,
conja,
conjw,
conjx,
m,
b_n,
alpha,
a, inca, lda,
w, incw,
x, incx,
beta,
y, incy,
z, incz );
}
void bli_cccdotxaxpyf_opt_var1( conj_t conjat,
conj_t conja,
conj_t conjw,
conj_t conjx,
dim_t m,
dim_t b_n,
scomplex* restrict alpha,
scomplex* restrict a, inc_t inca, inc_t lda,
scomplex* restrict w, inc_t incw,
scomplex* restrict x, inc_t incx,
scomplex* restrict beta,
scomplex* restrict y, inc_t incy,
scomplex* restrict z, inc_t incz )
void bli_cdotxaxpyf_opt_var1( conj_t conjat,
conj_t conja,
conj_t conjw,
conj_t conjx,
dim_t m,
dim_t b_n,
scomplex* restrict alpha,
scomplex* restrict a, inc_t inca, inc_t lda,
scomplex* restrict w, inc_t incw,
scomplex* restrict x, inc_t incx,
scomplex* restrict beta,
scomplex* restrict y, inc_t incy,
scomplex* restrict z, inc_t incz )
{
/* Just call the reference implementation. */
bli_cccdotxaxpyf_unb_var1( conjat,
conja,
conjw,
conjx,
m,
b_n,
alpha,
a, inca, lda,
w, incw,
x, incx,
beta,
y, incy,
z, incz );
BLIS_CDOTXAXPYF_KERNEL_REF( conjat,
conja,
conjw,
conjx,
m,
b_n,
alpha,
a, inca, lda,
w, incw,
x, incx,
beta,
y, incy,
z, incz );
}
void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
conj_t conja,
conj_t conjw,
conj_t conjx,
dim_t m,
dim_t b_n,
dcomplex* restrict alpha,
dcomplex* restrict a, inc_t inca, inc_t lda,
dcomplex* restrict w, inc_t incw,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict beta,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict z, inc_t incz )
void bli_zdotxaxpyf_opt_var1( conj_t conjat,
conj_t conja,
conj_t conjw,
conj_t conjx,
dim_t m,
dim_t b_n,
dcomplex* restrict alpha,
dcomplex* restrict a, inc_t inca, inc_t lda,
dcomplex* restrict w, inc_t incw,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict beta,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict z, inc_t incz )
{
/*
@@ -289,19 +289,19 @@ void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_zzzdotxaxpyf_unb_var1( conjat,
conja,
conjw,
conjx,
m,
b_n,
alpha,
a, inca, lda,
w, incw,
x, incx,
beta,
y, incy,
z, incz );
BLIS_ZDOTXAXPYF_KERNEL_REF( conjat,
conja,
conjw,
conjx,
m,
b_n,
alpha,
a, inca, lda,
w, incw,
x, incx,
beta,
y, incy,
z, incz );
return;
}
@@ -560,51 +560,3 @@ void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
}
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3U12 generates one heterogeneous-typed dotxaxpyf wrapper per
// instantiation. The wrapper is named by pasting cha/chb/chc onto varname
// and forwards every argument, unchanged, to the function named by pasting
// the same three characters onto kername.
//   ctype_a  : element type of a
//   ctype_b  : element type of w and x
//   ctype_c  : element type of beta, y and z
//   ctype_ab : element type of alpha
//   chab     : accepted but not referenced in the body
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, kername ) \
\
void PASTEMAC3(cha,chb,chc,varname)( \
conj_t conjat, \
conj_t conja, \
conj_t conjw, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype_ab* restrict alpha, \
ctype_a* restrict a, inc_t inca, inc_t lda, \
ctype_b* restrict w, inc_t incw, \
ctype_b* restrict x, inc_t incx, \
ctype_c* restrict beta, \
ctype_c* restrict y, inc_t incy, \
ctype_c* restrict z, inc_t incz \
) \
{ \
/* Just call the reference implementation. The type characters pasted */ \
/* here must be this macro's own parameters (cha/chb/chc); any other */ \
/* identifier would be pasted literally into the callee's name. */ \
PASTEMAC3(cha,chb,chc,kername)( conjat, \
conja, \
conjw, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
w, incw, \
x, incx, \
beta, \
y, incy, \
z, incz ); \
}
// The wrappers are only instantiated when the corresponding mixed-datatype
// support is enabled; each one pairs dotxaxpyf_opt_var1 (the generated name)
// with dotxaxpyf_unb_var1 (the forwarded-to name).
// NOTE(review): the INSERT_GENTFUNC3U12_MIX_* macros are defined elsewhere;
// presumably they expand GENTFUNC3U12 once per mixed type combination.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
#endif

View File

@@ -1,67 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype kernel interfaces.
//
// GENTPROT3U12 expands to a prototype for the dotxaxpyf kernel whose name is
// formed by pasting cha/chb/chc onto varname.
//   ctype_a  : element type of a
//   ctype_b  : element type of w and x
//   ctype_c  : element type of beta, y and z
//   ctype_ab : element type of alpha
//   chab     : accepted but not referenced in the prototype
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \
\
void PASTEMAC3(cha,chb,chc,varname)( \
conj_t conjat, \
conj_t conja, \
conj_t conjw, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype_ab* restrict alpha, \
ctype_a* restrict a, inc_t inca, inc_t lda, \
ctype_b* restrict w, inc_t incw, \
ctype_b* restrict x, inc_t incx, \
ctype_c* restrict beta, \
ctype_c* restrict y, inc_t incy, \
ctype_c* restrict z, inc_t incz \
);
// Basic prototypes are always emitted; the mixed-domain and mixed-precision
// sets are emitted only when the matching support macro is defined.
// NOTE(review): the INSERT_GENTPROT3U12_* macros are defined elsewhere;
// presumably each expands GENTPROT3U12 once per type combination in its set.
INSERT_GENTPROT3U12_BASIC( dotxaxpyf_opt_var1 )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_opt_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_opt_var1 )
#endif

View File

@@ -36,95 +36,95 @@
void bli_sssdotxf_opt_var1(
conj_t conjat,
conj_t conjx,
dim_t m,
dim_t b_n,
float* restrict alpha,
float* restrict a, inc_t inca, inc_t lda,
float* restrict x, inc_t incx,
float* restrict beta,
float* restrict y, inc_t incy
)
void bli_sdotxf_opt_var1(
conj_t conjat,
conj_t conjx,
dim_t m,
dim_t b_n,
float* restrict alpha,
float* restrict a, inc_t inca, inc_t lda,
float* restrict x, inc_t incx,
float* restrict beta,
float* restrict y, inc_t incy
)
{
/* Just call the reference implementation. */
bli_sssdotxf_unb_var1( conjat,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
beta,
y, incy );
BLIS_SDOTXF_KERNEL_REF( conjat,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
beta,
y, incy );
}
void bli_ddddotxf_opt_var1(
conj_t conjat,
conj_t conjx,
dim_t m,
dim_t b_n,
double* restrict alpha,
double* restrict a, inc_t inca, inc_t lda,
double* restrict x, inc_t incx,
double* restrict beta,
double* restrict y, inc_t incy
)
void bli_ddotxf_opt_var1(
conj_t conjat,
conj_t conjx,
dim_t m,
dim_t b_n,
double* restrict alpha,
double* restrict a, inc_t inca, inc_t lda,
double* restrict x, inc_t incx,
double* restrict beta,
double* restrict y, inc_t incy
)
{
/* Just call the reference implementation. */
bli_ddddotxf_unb_var1( conjat,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
beta,
y, incy );
BLIS_DDOTXF_KERNEL_REF( conjat,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
beta,
y, incy );
}
void bli_cccdotxf_opt_var1(
conj_t conjat,
conj_t conjx,
dim_t m,
dim_t b_n,
scomplex* restrict alpha,
scomplex* restrict a, inc_t inca, inc_t lda,
scomplex* restrict x, inc_t incx,
scomplex* restrict beta,
scomplex* restrict y, inc_t incy
)
void bli_cdotxf_opt_var1(
conj_t conjat,
conj_t conjx,
dim_t m,
dim_t b_n,
scomplex* restrict alpha,
scomplex* restrict a, inc_t inca, inc_t lda,
scomplex* restrict x, inc_t incx,
scomplex* restrict beta,
scomplex* restrict y, inc_t incy
)
{
/* Just call the reference implementation. */
bli_cccdotxf_unb_var1( conjat,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
beta,
y, incy );
BLIS_CDOTXF_KERNEL_REF( conjat,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
beta,
y, incy );
}
void bli_zzzdotxf_opt_var1(
conj_t conjat,
conj_t conjx,
dim_t m,
dim_t b_n,
dcomplex* restrict alpha,
dcomplex* restrict a, inc_t inca, inc_t lda,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict beta,
dcomplex* restrict y, inc_t incy
)
void bli_zdotxf_opt_var1(
conj_t conjat,
conj_t conjx,
dim_t m,
dim_t b_n,
dcomplex* restrict alpha,
dcomplex* restrict a, inc_t inca, inc_t lda,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict beta,
dcomplex* restrict y, inc_t incy
)
{
/*
Template dotxf kernel implementation
@@ -265,15 +265,15 @@ void bli_zzzdotxf_opt_var1(
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_zzzdotxf_unb_var1( conjat,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
beta,
y, incy );
BLIS_ZDOTXF_KERNEL_REF( conjat,
conjx,
m,
b_n,
alpha,
a, inca, lda,
x, incx,
beta,
y, incy );
return;
}
@@ -414,43 +414,3 @@ void bli_zzzdotxf_opt_var1(
}
}
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTFUNC3U12 generates one heterogeneous-typed dotxf wrapper per
// instantiation. The wrapper is named by pasting chx/chy/chr onto varname
// and forwards every argument, unchanged, to the function named by pasting
// the same three characters onto kername.
//   ctype_x  : element type of a
//   ctype_y  : element type of x
//   ctype_r  : element type of beta and y
//   ctype_xy : element type of alpha
//   chxy     : accepted but not referenced in the body
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjat, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype_xy* restrict alpha, \
ctype_x* restrict a, inc_t inca, inc_t lda, \
ctype_y* restrict x, inc_t incx, \
ctype_r* restrict beta, \
ctype_r* restrict y, inc_t incy \
) \
{ \
/* Just call the reference implementation. The type characters pasted */ \
/* here must be this macro's own parameters (chx/chy/chr); any other */ \
/* identifier would be pasted literally into the callee's name. */ \
PASTEMAC3(chx,chy,chr,kername)( conjat, \
conjx, \
m, \
b_n, \
alpha, \
a, inca, lda, \
x, incx, \
beta, \
y, incy ); \
}
// The wrappers are only instantiated when the corresponding mixed-datatype
// support is enabled; each one pairs dotxf_opt_var1 (the generated name)
// with dotxf_unb_var1 (the forwarded-to name).
// NOTE(review): the INSERT_GENTFUNC3U12_MIX_* macros are defined elsewhere;
// presumably they expand GENTFUNC3U12 once per mixed type combination.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxf_opt_var1, dotxf_unb_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxf_opt_var1, dotxf_unb_var1 )
#endif

View File

@@ -1,63 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype kernel interfaces.
//
// GENTPROT3U12 expands to a prototype for the dotxf kernel whose name is
// formed by pasting cha/chx/chy onto varname.
//   ctype_a  : element type of a
//   ctype_x  : element type of x
//   ctype_y  : element type of beta and y
//   ctype_ax : element type of alpha
//   chax     : accepted but not referenced in the prototype
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conjat, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype_ax* restrict alpha, \
ctype_a* restrict a, inc_t inca, inc_t lda, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict beta, \
ctype_y* restrict y, inc_t incy \
);
// Basic prototypes are always emitted; the mixed-domain and mixed-precision
// sets are emitted only when the matching support macro is defined.
// NOTE(review): the INSERT_GENTPROT3U12_* macros are defined elsewhere;
// presumably each expands GENTPROT3U12 once per type combination in its set.
INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3U12_MIX_D( dotxf_opt_var1 )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3U12_MIX_P( dotxf_opt_var1 )
#endif

View File

@@ -47,7 +47,7 @@ void bli_sgemm_opt_mxn(
)
{
/* Just call the reference implementation. */
bli_sgemm_ref_mxn( k,
BLIS_SGEMM_UKERNEL_REF( k,
alpha,
a1,
b1,
@@ -275,7 +275,7 @@ void bli_cgemm_opt_mxn(
)
{
/* Just call the reference implementation. */
bli_cgemm_ref_mxn( k,
BLIS_CGEMM_UKERNEL_REF( k,
alpha,
a1,
b1,
@@ -297,7 +297,7 @@ void bli_zgemm_opt_mxn(
)
{
/* Just call the reference implementation. */
bli_zgemm_ref_mxn( k,
BLIS_ZGEMM_UKERNEL_REF( k,
alpha,
a1,
b1,

View File

@@ -1,53 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype micro-kernel interfaces.
//
// GENTPROT expands to a prototype for the gemm micro-kernel whose name is
// formed by pasting ch onto varname. All operand pointers (alpha, a1, b1,
// beta, c11) share the single element type ctype; c11 is accompanied by its
// rs_c/cs_c increments, and an auxinfo_t pointer is passed last.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a1, \
ctype* restrict b1, \
ctype* restrict beta, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// expands GENTPROT once per basic datatype for gemm_opt_mxn.
INSERT_GENTPROT_BASIC( gemm_opt_mxn )

View File

@@ -1,54 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype micro-kernel interfaces.
//
// GENTPROT expands to a prototype for the gemmtrsm (lower) micro-kernel whose
// name is formed by pasting ch onto varname. All operand pointers (alpha,
// a10, a11, b01, b11, c11) share the single element type ctype; c11 is
// accompanied by its rs_c/cs_c increments, and an auxinfo_t pointer is
// passed last.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a10, \
ctype* restrict a11, \
ctype* restrict b01, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// expands GENTPROT once per basic datatype for gemmtrsm_l_opt_mxn.
INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_mxn )

View File

@@ -44,7 +44,7 @@ void bli_strsm_l_opt_mxn(
)
{
/* Just call the reference implementation. */
bli_strsm_l_ref_mxn( a11,
BLIS_STRSM_L_UKERNEL_REF( a11,
b11,
c11, rs_c, cs_c,
data );
@@ -216,7 +216,7 @@ void bli_ctrsm_l_opt_mxn(
)
{
/* Just call the reference implementation. */
bli_ctrsm_l_ref_mxn( a11,
BLIS_CTRSM_L_UKERNEL_REF( a11,
b11,
c11, rs_c, cs_c,
data );
@@ -232,7 +232,7 @@ void bli_ztrsm_l_opt_mxn(
)
{
/* Just call the reference implementation. */
bli_ztrsm_l_ref_mxn( a11,
BLIS_ZTRSM_L_UKERNEL_REF( a11,
b11,
c11, rs_c, cs_c,
data );

View File

@@ -37,25 +37,25 @@
void bli_strsm_u_opt_mxn(
float* restrict a,
float* restrict b,
float* restrict c, inc_t rs_c, inc_t cs_c,
float* restrict a11,
float* restrict b11,
float* restrict c11, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
/* Just call the reference implementation. */
bli_strsm_u_ref_mxn( a,
b,
c, rs_c, cs_c,
BLIS_STRSM_U_UKERNEL_REF( a11,
b11,
c11, rs_c, cs_c,
data );
}
void bli_dtrsm_u_opt_mxn(
double* restrict a,
double* restrict b,
double* restrict c, inc_t rs_c, inc_t cs_c,
double* restrict a11,
double* restrict b11,
double* restrict c11, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
@@ -168,18 +168,18 @@ void bli_dtrsm_u_opt_mxn(
{
i = m - iter - 1;
n_behind = iter;
alpha11 = a + (i )*rs_a + (i )*cs_a;
a12t = a + (i )*rs_a + (i+1)*cs_a;
x1 = b + (i )*rs_b + (0 )*cs_b;
X2 = b + (i+1)*rs_b + (0 )*cs_b;
alpha11 = a11 + (i )*rs_a + (i )*cs_a;
a12t = a11 + (i )*rs_a + (i+1)*cs_a;
x1 = b11 + (i )*rs_b + (0 )*cs_b;
X2 = b11 + (i+1)*rs_b + (0 )*cs_b;
/* x1 = x1 - a12t * X2; */
/* x1 = x1 / alpha11; */
for ( j = 0; j < n; ++j )
{
chi11 = x1 + (0 )*rs_b + (j )*cs_b;
x21 = X2 + (0 )*rs_b + (j )*cs_b;
gamma11 = c + (i )*rs_c + (j )*cs_c;
chi11 = x1 + (0 )*rs_b + (j )*cs_b;
x21 = X2 + (0 )*rs_b + (j )*cs_b;
gamma11 = c11 + (i )*rs_c + (j )*cs_c;
/* chi11 = chi11 - a12t * x21; */
bli_dset0s( rho11 );
@@ -208,32 +208,32 @@ void bli_dtrsm_u_opt_mxn(
void bli_ctrsm_u_opt_mxn(
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
scomplex* restrict a11,
scomplex* restrict b11,
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
/* Just call the reference implementation. */
bli_ctrsm_u_ref_mxn( a,
b,
c, rs_c, cs_c,
BLIS_CTRSM_U_UKERNEL_REF( a11,
b11,
c11, rs_c, cs_c,
data );
}
void bli_ztrsm_u_opt_mxn(
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a11,
dcomplex* restrict b11,
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
/* Just call the reference implementation. */
bli_ztrsm_u_ref_mxn( a,
b,
c, rs_c, cs_c,
BLIS_ZTRSM_U_UKERNEL_REF( a11,
b11,
c11, rs_c, cs_c,
data );
}

View File

@@ -1,50 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype micro-kernel interfaces.
//
// GENTPROT expands to a prototype for the trsm (upper) micro-kernel whose
// name is formed by pasting ch onto varname. The a, b and c pointers share
// the single element type ctype; c is accompanied by its rs_c/cs_c
// increments, and an auxinfo_t pointer is passed last.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere; presumably it
// expands GENTPROT once per basic datatype for trsm_u_opt_mxn.
INSERT_GENTPROT_BASIC( trsm_u_opt_mxn )

View File

@@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \
y ); \
}
GENFRONT( addv, ADDV_KERNEL )
GENFRONT( addv, addv_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_addv_check.h"
#include "bli_addv_unb_var1.h"
#include "bli_addv_kernel.h"
#include "bli_addv_ref.h"
//

View File

@@ -0,0 +1,115 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (an alias for addv_fp) is the common signature of the
// void-pointer addv wrappers: a conjugation flag for x, the vector length,
// and an untyped buffer plus increment for each of x and y.
#define FUNCPTR_T addv_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
dim_t n,
void* x, inc_t incx,
void* y, inc_t incy
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
// The table is named ftypes and is built from the addv_kernel_void wrappers;
// bli_addv_kernel() indexes it as ftypes[dt_x][dt_y].
// NOTE(review): GENARRAY2_ALL/_EXT/_MIN are defined elsewhere; presumably
// they emit the declarator and initializer for the full, mixed-domain-only,
// and homogeneous-only tables respectively.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_kernel_void);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_kernel_void);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_kernel_void);
#endif
#endif
/*
   Object-based front end for the addv kernel.

   Reads the datatype, conjugation status, vector length, increment and
   buffer address out of the x and y objects, selects the typed wrapper
   stored at ftypes[datatype of x][datatype of y], and calls it with the
   extracted fields.

   x  - source vector object; also supplies the conjugation flag and the
        vector length passed to the wrapper.
   y  - destination vector object.

   The selected table entry is called without a NULL check.
*/
void bli_addv_kernel( obj_t* x,
                      obj_t* y )
{
    /* The two datatypes pick the row and column of the dispatch table. */
    const num_t  type_of_x = bli_obj_datatype( *x );
    const num_t  type_of_y = bli_obj_datatype( *y );

    /* Conjugation and length are taken from x only. */
    const conj_t x_conj    = bli_obj_conj_status( *x );
    const dim_t  length    = bli_obj_vector_dim( *x );

    /* Stride and starting address of each operand. */
    const inc_t  x_stride  = bli_obj_vector_inc( *x );
    void*        x_data    = bli_obj_buffer_at_off( *x );
    const inc_t  y_stride  = bli_obj_vector_inc( *y );
    void*        y_data    = bli_obj_buffer_at_off( *y );

    /* Look up the wrapper for this type combination and run it. */
    FUNCPTR_T typed_wrapper = ftypes[type_of_x][type_of_y];

    typed_wrapper( x_conj, length, x_data, x_stride, y_data, y_stride );
}
// GENTFUNC2 generates one void-pointer wrapper per instantiation. The
// wrapper is named by pasting chx/chy onto varname, matches the FUNCPTR_T
// signature, and forwards its arguments unchanged to the function named by
// pasting chx/chy onto kername. The x and y buffers are passed through as
// void*; ctype_x and ctype_y are accepted but not referenced in the body.
#undef GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
\
void PASTEMAC2(chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
void* x, inc_t incx, \
void* y, inc_t incy \
) \
{ \
PASTEMAC2(chx,chy,kername)( conjx, \
n, \
x, incx, \
y, incy ); \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
// NOTE(review): how PASTEMAC2 resolves the ADDV_KERNEL token to a concrete
// kernel name is defined elsewhere; confirm against the kernel-name macros.
INSERT_GENTFUNC2_BASIC( addv_kernel_void, ADDV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC2_MIX_D( addv_kernel_void, ADDV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC2_MIX_P( addv_kernel_void, ADDV_KERNEL )
#endif

View File

@@ -32,10 +32,14 @@
*/
void bli_subv_unb_var1( obj_t* x,
obj_t* y );
void bli_addv_kernel( obj_t* x,
obj_t* y );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
@@ -46,12 +50,13 @@ void PASTEMAC2(chx,chy,varname)( \
void* y, inc_t incy \
);
INSERT_GENTPROT2_BASIC( subv_unb_var1 )
INSERT_GENTPROT2_BASIC( addv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT2_MIX_D( subv_unb_var1 )
INSERT_GENTPROT2_MIX_D( addv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT2_MIX_P( subv_unb_var1 )
INSERT_GENTPROT2_MIX_P( addv_kernel_void )
#endif

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T addv_fp
typedef void (*FUNCPTR_T)(
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_unb_var1);
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_unb_var1);
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_ref);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_unb_var1);
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_ref);
#endif
#endif
void bli_addv_unb_var1( obj_t* x,
void bli_addv_ref( obj_t* x,
obj_t* y )
{
num_t dt_x = bli_obj_datatype( *x );
@@ -83,17 +84,19 @@ void bli_addv_unb_var1( obj_t* x,
buf_x, inc_x,
buf_y, inc_y );
}
*/
#undef GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
\
void PASTEMAC2(chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
void* x, inc_t incx, \
void* y, inc_t incy \
) \
void PASTEMAC2(chx,chy,varname) \
( \
conj_t conjx, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
) \
{ \
ctype_x* x_cast = x; \
ctype_y* y_cast = y; \
@@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC2_BASIC( addv, addv_unb_var1 )
INSERT_GENTFUNC2_BASIC( addv, addv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC2_MIX_D( addv, addv_unb_var1 )
INSERT_GENTFUNC2_MIX_D( addv, addv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC2_MIX_P( addv, addv_unb_var1 )
INSERT_GENTFUNC2_MIX_P( addv, addv_ref )
#endif

View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
void bli_addv_ref( obj_t* x,
obj_t* y );
*/
// Prototype template for the typed addv reference implementation.
// Each expansion declares a void function named by
// PASTEMAC2(chx,chy,varname) that takes a conjugation flag for x, a
// length n, and restrict-qualified typed pointers to x and y together
// with their increments.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
void PASTEMAC2(chx,chy,varname) \
( \
conj_t conjx, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
);
// Emit the prototypes for addv_ref. The basic set is always emitted; the
// mixed-domain and mixed-precision sets are emitted only when the
// corresponding BLIS_ENABLE_MIXED_* macro is defined.
// NOTE(review): the INSERT_GENTPROT2_* macros are defined elsewhere;
// presumably each invokes GENTPROT2 once per datatype combination.
INSERT_GENTPROT2_BASIC( addv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT2_MIX_D( addv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT2_MIX_P( addv_ref )
#endif

View File

@@ -68,7 +68,7 @@ void PASTEMAC0(opname)( \
y ); \
}
GENFRONT( axpyv, AXPYV_KERNEL )
GENFRONT( axpyv, axpyv_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_axpyv_check.h"
#include "bli_axpyv_unb_var1.h"
#include "bli_axpyv_kernel.h"
#include "bli_axpyv_ref.h"
//

View File

@@ -0,0 +1,128 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias for axpyv_fp, the pointer type of the
// void-pointer axpyv kernel wrappers generated at the bottom of this file.
// All buffers (alpha, x, y) are passed as untyped void pointers.
#define FUNCPTR_T axpyv_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
dim_t n,
void* alpha,
void* x, inc_t incx,
void* y, inc_t incy
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
// The table is named ftypes and is indexed by bli_axpyv_kernel() as
// ftypes[dt_alpha][dt_x][dt_y]. The _ALL variant is selected when mixed
// precision support is enabled, otherwise _EXT when mixed domain support
// is enabled, otherwise _MIN.
// NOTE(review): the GENARRAY3_* macros are defined elsewhere; presumably
// they populate the table with the axpyv_kernel_void instances.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_kernel_void);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_kernel_void);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_kernel_void);
#endif
#endif
void bli_axpyv_kernel( obj_t* alpha,
                       obj_t* x,
                       obj_t* y )
{
	/*
	   Object-based entry point for the axpyv kernel: unpacks the alpha, x
	   and y objects into raw buffers and strides, then dispatches to the
	   void-pointer wrapper selected by the (alpha, x, y) datatypes.
	*/

	/* Datatypes used to select the wrapper. */
	num_t     x_dt      = bli_obj_datatype( *x );
	num_t     y_dt      = bli_obj_datatype( *y );

	/* Vector y: buffer and stride. */
	void*     y_buf     = bli_obj_buffer_at_off( *y );
	inc_t     y_inc     = bli_obj_vector_inc( *y );

	/* Vector x: buffer, stride, length and conjugation status. */
	void*     x_buf     = bli_obj_buffer_at_off( *x );
	inc_t     x_inc     = bli_obj_vector_inc( *x );
	dim_t     x_len     = bli_obj_vector_dim( *x );
	conj_t    x_conj    = bli_obj_conj_status( *x );

	/* Scalar alpha: both are filled in by bli_set_scalar_dt_buffer(). */
	num_t     alpha_dt;
	void*     alpha_buf;

	FUNCPTR_T kernel_fp;

	/* When alpha is one of the scalar constants, its value is looked up
	   using the datatype of x; otherwise alpha's own datatype and the
	   buffer at its offset are used. */
	bli_set_scalar_dt_buffer( alpha, x_dt, alpha_dt, alpha_buf );

	/* Select the wrapper for this datatype combination. */
	kernel_fp = ftypes[alpha_dt][x_dt][y_dt];

	/* Dispatch. */
	kernel_fp( x_conj,
	           x_len,
	           alpha_buf,
	           x_buf, x_inc,
	           y_buf, y_inc );
}
// Template for the void-pointer wrapper around one axpyv kernel instance.
// Each expansion defines a function named by
// PASTEMAC3(cha,chx,chy,varname) whose parameter list matches FUNCPTR_T
// and which forwards every argument, unchanged, to the function named by
// PASTEMAC3(cha,chx,chy,kername). The ctype_* parameters are not
// referenced in the expansion.
#undef GENTFUNC3
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname, kername ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
void* alpha, \
void* x, inc_t incx, \
void* y, inc_t incy \
) \
{ \
PASTEMAC3(cha,chx,chy,kername)( conjx, \
n, \
alpha, \
x, incx, \
y, incy ); \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
// NOTE(review): the INSERT_GENTFUNC3_* macros are defined elsewhere;
// presumably they pass axpyv_kernel_void as varname and AXPYV_KERNEL as
// kername for each datatype combination.
INSERT_GENTFUNC3_BASIC( axpyv_kernel_void, AXPYV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( axpyv_kernel_void, AXPYV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( axpyv_kernel_void, AXPYV_KERNEL )
#endif

View File

@@ -32,11 +32,15 @@
*/
void bli_axpyv_unb_var1( obj_t* alpha,
obj_t* x,
obj_t* y );
void bli_axpyv_kernel( obj_t* alpha,
obj_t* x,
obj_t* y );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT3
#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
\
@@ -48,13 +52,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \
void* y, inc_t incy \
);
INSERT_GENTPROT3_BASIC( axpyv_unb_var1 )
INSERT_GENTPROT3_BASIC( axpyv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3_MIX_D( axpyv_unb_var1 )
INSERT_GENTPROT3_MIX_D( axpyv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3_MIX_P( axpyv_unb_var1 )
INSERT_GENTPROT3_MIX_P( axpyv_kernel_void )
#endif

View File

@@ -33,7 +33,7 @@
*/
#include "blis.h"
/*
#define FUNCPTR_T axpyv_fp
typedef void (*FUNCPTR_T)(
@@ -47,17 +47,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_unb_var1);
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_unb_var1);
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_ref);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_unb_var1);
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_ref);
#endif
#endif
void bli_axpyv_unb_var1( obj_t* alpha,
void bli_axpyv_ref( obj_t* alpha,
obj_t* x,
obj_t* y )
{
@@ -94,18 +94,19 @@ void bli_axpyv_unb_var1( obj_t* alpha,
buf_x, inc_x,
buf_y, inc_y );
}
*/
#undef GENTFUNC3
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname, addvker ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
void* alpha, \
void* x, inc_t incx, \
void* y, inc_t incy \
) \
void PASTEMAC3(cha,chx,chy,varname) \
( \
conj_t conjx, \
dim_t n, \
ctype_a* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
) \
{ \
ctype_a* alpha_cast = alpha; \
ctype_x* x_cast = x; \
@@ -156,13 +157,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC3_BASIC( axpyv_unb_var1, ADDV_KERNEL )
INSERT_GENTFUNC3_BASIC( axpyv_ref, ADDV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( axpyv_unb_var1, ADDV_KERNEL )
INSERT_GENTFUNC3_MIX_D( axpyv_ref, ADDV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( axpyv_unb_var1, ADDV_KERNEL )
INSERT_GENTFUNC3_MIX_P( axpyv_ref, ADDV_KERNEL )
#endif

View File

@@ -32,28 +32,32 @@
*/
/*
void bli_axpyv_ref( obj_t* alpha,
obj_t* x,
obj_t* y );
*/
//
// Prototype axpyv kernel interfaces.
//
#undef GENTPROT3
#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
\
void PASTEMAC3(cha,chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
ctype_a* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
);
void PASTEMAC3(cha,chx,chy,varname) \
( \
conj_t conjx, \
dim_t n, \
ctype_a* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
);
INSERT_GENTPROT3_BASIC( axpyv_opt_var1 )
INSERT_GENTPROT3_BASIC( axpyv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 )
INSERT_GENTPROT3_MIX_D( axpyv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 )
INSERT_GENTPROT3_MIX_P( axpyv_ref )
#endif

View File

@@ -34,16 +34,6 @@
#include "blis.h"
/*
void bli_copyv( obj_t* x,
obj_t* y )
{
if ( bli_error_checking_is_enabled() )
bli_copyv_check( x, y );
bli_copyv_unb_var1( x, y );
}
*/
//
// Define object-based interface.
@@ -63,7 +53,7 @@ void PASTEMAC0(opname)( \
y ); \
}
GENFRONT( copyv, COPYV_KERNEL )
GENFRONT( copyv, copyv_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_copyv_check.h"
#include "bli_copyv_unb_var1.h"
#include "bli_copyv_kernel.h"
#include "bli_copyv_ref.h"
//

View File

@@ -0,0 +1,115 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias for copyv_fp, the pointer type of the
// void-pointer copyv kernel wrappers generated at the bottom of this file.
// Both vector buffers are passed as untyped void pointers.
#define FUNCPTR_T copyv_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
dim_t n,
void* x, inc_t incx,
void* y, inc_t incy
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
// The table is named ftypes and is indexed by bli_copyv_kernel() as
// ftypes[dt_x][dt_y]. The _ALL variant is selected when mixed precision
// support is enabled, otherwise _EXT when mixed domain support is
// enabled, otherwise _MIN.
// NOTE(review): the GENARRAY2_* macros are defined elsewhere; presumably
// they populate the table with the copyv_kernel_void instances.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_kernel_void);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_kernel_void);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_kernel_void);
#endif
#endif
void bli_copyv_kernel( obj_t* x,
                       obj_t* y )
{
	/*
	   Object-based entry point for the copyv kernel: unpacks the x and y
	   objects into raw buffers and strides, then dispatches to the
	   void-pointer wrapper selected by the (x, y) datatypes.
	*/

	/* Vector y: datatype, buffer and stride. */
	num_t     y_dt   = bli_obj_datatype( *y );
	void*     y_buf  = bli_obj_buffer_at_off( *y );
	inc_t     y_inc  = bli_obj_vector_inc( *y );

	/* Vector x: datatype, buffer, stride, length and conjugation status. */
	num_t     x_dt   = bli_obj_datatype( *x );
	void*     x_buf  = bli_obj_buffer_at_off( *x );
	inc_t     x_inc  = bli_obj_vector_inc( *x );
	dim_t     x_len  = bli_obj_vector_dim( *x );
	conj_t    x_conj = bli_obj_conj_status( *x );

	/* Select the wrapper for this datatype combination. */
	FUNCPTR_T kernel_fp = ftypes[x_dt][y_dt];

	/* Dispatch. */
	kernel_fp( x_conj,
	           x_len,
	           x_buf, x_inc,
	           y_buf, y_inc );
}
// Template for the void-pointer wrapper around one copyv kernel instance.
// Each expansion defines a function named by PASTEMAC2(chx,chy,varname)
// whose parameter list matches FUNCPTR_T and which forwards every
// argument, unchanged, to the function named by
// PASTEMAC2(chx,chy,kername). The ctype_* parameters are not referenced
// in the expansion.
#undef GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
\
void PASTEMAC2(chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
void* x, inc_t incx, \
void* y, inc_t incy \
) \
{ \
PASTEMAC2(chx,chy,kername)( conjx, \
n, \
x, incx, \
y, incy ); \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
// NOTE(review): the INSERT_GENTFUNC2_* macros are defined elsewhere;
// presumably they pass copyv_kernel_void as varname and COPYV_KERNEL as
// kername for each datatype combination.
INSERT_GENTFUNC2_BASIC( copyv_kernel_void, COPYV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC2_MIX_D( copyv_kernel_void, COPYV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC2_MIX_P( copyv_kernel_void, COPYV_KERNEL )
#endif

View File

@@ -32,10 +32,14 @@
*/
void bli_copyv_unb_var1( obj_t* x,
obj_t* y );
void bli_copyv_kernel( obj_t* x,
obj_t* y );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
@@ -46,12 +50,12 @@ void PASTEMAC2(chx,chy,varname)( \
void* y, inc_t incy \
);
INSERT_GENTPROT2_BASIC( copyv_unb_var1 )
INSERT_GENTPROT2_BASIC( copyv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT2_MIX_D( copyv_unb_var1 )
INSERT_GENTPROT2_MIX_D( copyv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT2_MIX_P( copyv_unb_var1 )
INSERT_GENTPROT2_MIX_P( copyv_kernel_void )
#endif

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T copyv_fp
typedef void (*FUNCPTR_T)(
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_unb_var1);
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_unb_var1);
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_ref);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_unb_var1);
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_ref);
#endif
#endif
void bli_copyv_unb_var1( obj_t* x,
void bli_copyv_ref( obj_t* x,
obj_t* y )
{
num_t dt_x = bli_obj_datatype( *x );
@@ -83,17 +84,19 @@ void bli_copyv_unb_var1( obj_t* x,
buf_x, inc_x,
buf_y, inc_y );
}
*/
#undef GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
\
void PASTEMAC2(chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
void* x, inc_t incx, \
void* y, inc_t incy \
) \
void PASTEMAC2(chx,chy,varname) \
( \
conj_t conjx, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
) \
{ \
ctype_x* x_cast = x; \
ctype_y* y_cast = y; \
@@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC2_BASIC( copyv, copyv_unb_var1 )
INSERT_GENTFUNC2_BASIC( copyv, copyv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC2_MIX_D( copyv, copyv_unb_var1 )
INSERT_GENTFUNC2_MIX_D( copyv, copyv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC2_MIX_P( copyv, copyv_unb_var1 )
INSERT_GENTFUNC2_MIX_P( copyv, copyv_ref )
#endif

View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
void bli_copyv_ref( obj_t* x,
obj_t* y );
*/
// Prototype template for the typed copyv reference implementation.
// Each expansion declares a void function named by
// PASTEMAC2(chx,chy,varname) that takes a conjugation flag for x, a
// length n, and restrict-qualified typed pointers to x and y together
// with their increments.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
void PASTEMAC2(chx,chy,varname) \
( \
conj_t conjx, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
);
// Emit the prototypes for copyv_ref. The basic set is always emitted; the
// mixed-domain and mixed-precision sets are emitted only when the
// corresponding BLIS_ENABLE_MIXED_* macro is defined.
// NOTE(review): the INSERT_GENTPROT2_* macros are defined elsewhere;
// presumably each invokes GENTPROT2 once per datatype combination.
INSERT_GENTPROT2_BASIC( copyv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT2_MIX_D( copyv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT2_MIX_P( copyv_ref )
#endif

View File

@@ -55,7 +55,7 @@ void PASTEMAC0(opname)( \
rho ); \
}
GENFRONT( dotv, DOTV_KERNEL )
GENFRONT( dotv, dotv_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_dotv_check.h"
#include "bli_dotv_unb_var1.h"
#include "bli_dotv_kernel.h"
#include "bli_dotv_ref.h"
//

View File

@@ -0,0 +1,128 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias for dotv_fp, the pointer type of the
// void-pointer dotv kernel wrappers generated at the bottom of this file.
// The vector buffers and the rho result location are passed as untyped
// void pointers.
#define FUNCPTR_T dotv_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
conj_t conjy,
dim_t n,
void* x, inc_t incx,
void* y, inc_t incy,
void* rho
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
// The table is named ftypes and is indexed by bli_dotv_kernel() as
// ftypes[dt_x][dt_y][dt_rho]. The _ALL variant is selected when mixed
// precision support is enabled, otherwise _EXT when mixed domain support
// is enabled, otherwise _MIN.
// NOTE(review): the GENARRAY3_* macros are defined elsewhere; presumably
// they populate the table with the dotv_kernel_void instances.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_kernel_void);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_kernel_void);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_kernel_void);
#endif
#endif
void bli_dotv_kernel( obj_t* x,
                      obj_t* y,
                      obj_t* rho )
{
	/*
	   Object-based entry point for the dotv kernel: unpacks the x, y and
	   rho objects into raw buffers and strides, then dispatches to the
	   void-pointer wrapper selected by the (x, y, rho) datatypes.
	*/

	/* Result rho: datatype and buffer. */
	num_t     rho_dt  = bli_obj_datatype( *rho );
	void*     rho_buf = bli_obj_buffer_at_off( *rho );

	/* Vector y: datatype, buffer, stride and conjugation status. */
	num_t     y_dt    = bli_obj_datatype( *y );
	void*     y_buf   = bli_obj_buffer_at_off( *y );
	inc_t     y_inc   = bli_obj_vector_inc( *y );
	conj_t    y_conj  = bli_obj_conj_status( *y );

	/* Vector x: datatype, buffer, stride, length and conjugation status. */
	num_t     x_dt    = bli_obj_datatype( *x );
	void*     x_buf   = bli_obj_buffer_at_off( *x );
	inc_t     x_inc   = bli_obj_vector_inc( *x );
	dim_t     x_len   = bli_obj_vector_dim( *x );
	conj_t    x_conj  = bli_obj_conj_status( *x );

	/* Select the wrapper for this datatype combination. */
	FUNCPTR_T kernel_fp = ftypes[x_dt][y_dt][rho_dt];

	/* Dispatch. */
	kernel_fp( x_conj,
	           y_conj,
	           x_len,
	           x_buf, x_inc,
	           y_buf, y_inc,
	           rho_buf );
}
// Template for the void-pointer wrapper around one dotv kernel instance.
// Each expansion defines a function named by
// PASTEMAC3(chx,chy,chr,varname) whose parameter list matches FUNCPTR_T
// and which forwards every argument, unchanged, to the function named by
// PASTEMAC3(chx,chy,chr,kername). The ctype_* parameters are not
// referenced in the expansion.
#undef GENTFUNC3
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname, kername ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
void* x, inc_t incx, \
void* y, inc_t incy, \
void* rho \
) \
{ \
PASTEMAC3(chx,chy,chr,kername)( conjx, \
conjy, \
n, \
x, incx, \
y, incy, \
rho ); \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
// NOTE(review): the INSERT_GENTFUNC3_* macros are defined elsewhere;
// presumably they pass dotv_kernel_void as varname and DOTV_KERNEL as
// kername for each datatype combination.
INSERT_GENTFUNC3_BASIC( dotv_kernel_void, DOTV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( dotv_kernel_void, DOTV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( dotv_kernel_void, DOTV_KERNEL )
#endif

View File

@@ -32,11 +32,15 @@
*/
void bli_dotv_unb_var1( obj_t* x,
obj_t* y,
obj_t* rho );
void bli_dotv_kernel( obj_t* x,
obj_t* y,
obj_t* rho );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT3
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
\
@@ -49,13 +53,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
void* rho \
);
INSERT_GENTPROT3_BASIC( dotv_unb_var1 )
INSERT_GENTPROT3_BASIC( dotv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3_MIX_D( dotv_unb_var1 )
INSERT_GENTPROT3_MIX_D( dotv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3_MIX_P( dotv_unb_var1 )
INSERT_GENTPROT3_MIX_P( dotv_kernel_void )
#endif

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T dotv_fp
typedef void (*FUNCPTR_T)(
@@ -48,17 +49,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_unb_var1);
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_unb_var1);
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_ref);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_unb_var1);
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_ref);
#endif
#endif
void bli_dotv_unb_var1( obj_t* x,
void bli_dotv_ref( obj_t* x,
obj_t* y,
obj_t* rho )
{
@@ -92,19 +93,20 @@ void bli_dotv_unb_var1( obj_t* x,
buf_y, inc_y,
buf_rho );
}
*/
#undef GENTFUNC3
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
void* x, inc_t incx, \
void* y, inc_t incy, \
void* rho \
) \
void PASTEMAC3(chx,chy,chr,varname) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_r* restrict rho \
) \
{ \
ctype_x* x_cast = x; \
ctype_y* y_cast = y; \
@@ -163,13 +165,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC3_BASIC( dotv, dotv_unb_var1 )
INSERT_GENTFUNC3_BASIC( dotv, dotv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( dotv, dotv_unb_var1 )
INSERT_GENTFUNC3_MIX_D( dotv, dotv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( dotv, dotv_unb_var1 )
INSERT_GENTFUNC3_MIX_P( dotv, dotv_ref )
#endif

View File

@@ -0,0 +1,64 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
void bli_dotv_ref( obj_t* x,
obj_t* y,
obj_t* rho );
*/
// Prototype template for the typed dotv reference implementation.
// Each expansion declares a void function named by
// PASTEMAC3(chx,chy,chr,varname) that takes conjugation flags for x and
// y, a length n, restrict-qualified typed pointers to x and y together
// with their increments, and a restrict-qualified typed pointer to rho.
#undef GENTPROT3
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_r* restrict rho \
);
// Emit the prototypes for dotv_ref. The basic set is always emitted; the
// mixed-domain and mixed-precision sets are emitted only when the
// corresponding BLIS_ENABLE_MIXED_* macro is defined.
// NOTE(review): the INSERT_GENTPROT3_* macros are defined elsewhere;
// presumably each invokes GENTPROT3 once per datatype combination.
INSERT_GENTPROT3_BASIC( dotv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3_MIX_D( dotv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3_MIX_P( dotv_ref )
#endif

View File

@@ -59,7 +59,7 @@ void PASTEMAC0(opname)( \
rho ); \
}
GENFRONT( dotxv, DOTXV_KERNEL )
GENFRONT( dotxv, dotxv_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_dotxv_check.h"
#include "bli_dotxv_unb_var1.h"
#include "bli_dotxv_kernel.h"
#include "bli_dotxv_ref.h"
//

View File

@@ -0,0 +1,153 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias for dotxv_fp, the pointer type of the
// void-pointer dotxv kernel wrappers generated at the bottom of this file.
// The scalars (alpha, beta), the vector buffers and the rho result
// location are all passed as untyped void pointers.
#define FUNCPTR_T dotxv_fp
typedef void (*FUNCPTR_T)(
conj_t conjx,
conj_t conjy,
dim_t n,
void* alpha,
void* x, inc_t incx,
void* y, inc_t incy,
void* beta,
void* rho
);
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
// The table is named ftypes and is indexed by bli_dotxv_kernel() as
// ftypes[dt_x][dt_y][dt_rho]; the datatypes of alpha and beta do not
// take part in the lookup. The _ALL variant is selected when mixed
// precision support is enabled, otherwise _EXT when mixed domain support
// is enabled, otherwise _MIN.
// NOTE(review): the GENARRAY3_* macros are defined elsewhere; presumably
// they populate the table with the dotxv_kernel_void instances.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_kernel_void);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_kernel_void);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_kernel_void);
#endif
#endif
void bli_dotxv_kernel( obj_t* alpha,
                       obj_t* x,
                       obj_t* y,
                       obj_t* beta,
                       obj_t* rho )
{
	/*
	   Object-based entry point for the dotxv kernel: unpacks the alpha, x,
	   y, beta and rho objects into raw buffers and strides, then dispatches
	   to the void-pointer wrapper selected by the (x, y, rho) datatypes.
	*/

	/* Result rho: datatype and buffer. */
	num_t     rho_dt  = bli_obj_datatype( *rho );
	void*     rho_buf = bli_obj_buffer_at_off( *rho );

	/* Vector y: datatype, buffer, stride and conjugation status. */
	num_t     y_dt    = bli_obj_datatype( *y );
	void*     y_buf   = bli_obj_buffer_at_off( *y );
	inc_t     y_inc   = bli_obj_vector_inc( *y );
	conj_t    y_conj  = bli_obj_conj_status( *y );

	/* Vector x: datatype, buffer, stride, length and conjugation status. */
	num_t     x_dt    = bli_obj_datatype( *x );
	void*     x_buf   = bli_obj_buffer_at_off( *x );
	inc_t     x_inc   = bli_obj_vector_inc( *x );
	dim_t     x_len   = bli_obj_vector_dim( *x );
	conj_t    x_conj  = bli_obj_conj_status( *x );

	/* Scalars: datatypes and buffers are resolved below. */
	num_t     alpha_dt;
	void*     alpha_buf;
	num_t     beta_dt;
	void*     beta_buf;

	FUNCPTR_T kernel_fp;

	/* alpha is required to be read as the type union of x and y, so that
	   no information is needlessly lost during the computation. */
	alpha_dt  = bli_datatype_union( x_dt, y_dt );
	alpha_buf = bli_obj_buffer_for_1x1( alpha_dt, *alpha );

	/* beta is required to be read with the same datatype as rho. */
	beta_dt   = rho_dt;
	beta_buf  = bli_obj_buffer_for_1x1( beta_dt, *beta );

	/* Select the wrapper for this datatype combination. */
	kernel_fp = ftypes[x_dt][y_dt][rho_dt];

	/* Dispatch. */
	kernel_fp( x_conj,
	           y_conj,
	           x_len,
	           alpha_buf,
	           x_buf, x_inc,
	           y_buf, y_inc,
	           beta_buf,
	           rho_buf );
}
// Template for the void-pointer wrapper around one dotxv kernel instance.
// Each expansion defines a function named by
// PASTEMAC3(chx,chy,chr,varname) whose parameter list matches FUNCPTR_T
// and which forwards every argument, unchanged, to the function named by
// PASTEMAC3(chx,chy,chr,kername). The ctype_* parameters and chxy are not
// referenced in the expansion.
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
void* alpha, \
void* x, inc_t incx, \
void* y, inc_t incy, \
void* beta, \
void* rho \
) \
{ \
PASTEMAC3(chx,chy,chr,kername)( conjx, \
conjy, \
n, \
alpha, \
x, incx, \
y, incy, \
beta, \
rho ); \
}
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
// NOTE(review): the INSERT_GENTFUNC3U12_* macros are defined elsewhere;
// presumably they pass dotxv_kernel_void as varname and DOTXV_KERNEL as
// kername for each datatype combination.
INSERT_GENTFUNC3U12_BASIC( dotxv_kernel_void, DOTXV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxv_kernel_void, DOTXV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxv_kernel_void, DOTXV_KERNEL )
#endif

View File

@@ -32,13 +32,17 @@
*/
void bli_dotxv_unb_var1( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* beta,
obj_t* rho );
void bli_dotxv_kernel( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* beta,
obj_t* rho );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
\
@@ -53,13 +57,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
void* rho \
);
INSERT_GENTPROT3U12_BASIC( dotxv_unb_var1 )
INSERT_GENTPROT3U12_BASIC( dotxv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3U12_MIX_D( dotxv_unb_var1 )
INSERT_GENTPROT3U12_MIX_D( dotxv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3U12_MIX_P( dotxv_unb_var1 )
INSERT_GENTPROT3U12_MIX_P( dotxv_kernel_void )
#endif

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T dotxv_fp
typedef void (*FUNCPTR_T)(
@@ -50,17 +51,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_unb_var1);
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_unb_var1);
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_ref);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_unb_var1);
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_ref);
#endif
#endif
void bli_dotxv_unb_var1( obj_t* alpha,
void bli_dotxv_ref( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* beta,
@@ -113,21 +114,23 @@ void bli_dotxv_unb_var1( obj_t* alpha,
buf_beta,
buf_rho );
}
*/
#undef GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
void* alpha, \
void* x, inc_t incx, \
void* y, inc_t incy, \
void* beta, \
void* rho \
) \
void PASTEMAC3(chx,chy,chr,varname) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype_xy* restrict alpha, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy, \
ctype_r* restrict beta, \
ctype_r* restrict rho \
) \
{ \
ctype_xy* alpha_cast = alpha; \
ctype_x* x_cast = x; \
@@ -194,13 +197,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC3U12_BASIC( dotxv, dotxv_unb_var1 )
INSERT_GENTFUNC3U12_BASIC( dotxv, dotxv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxv, dotxv_unb_var1 )
INSERT_GENTFUNC3U12_MIX_D( dotxv, dotxv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxv, dotxv_unb_var1 )
INSERT_GENTFUNC3U12_MIX_P( dotxv, dotxv_ref )
#endif

View File

@@ -0,0 +1,68 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
void bli_dotxv_ref( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* beta,
obj_t* rho );
*/
// Prototype template for the typed dotxv reference kernels. alpha is a
// ctype_xy, x a ctype_x, y a ctype_y, and both beta and rho are ctype_r;
// every pointer parameter is restrict-qualified.
#undef GENTPROT3U12
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
\
void PASTEMAC3(chx,chy,chr,varname) \
     ( \
       conj_t             conjx, \
       conj_t             conjy, \
       dim_t              n, \
       ctype_xy* restrict alpha, \
       ctype_x*  restrict x, inc_t incx, \
       ctype_y*  restrict y, inc_t incy, \
       ctype_r*  restrict beta, \
       ctype_r*  restrict rho \
     );

// Basic prototypes are always emitted; the mixed-domain and mixed-precision
// prototypes only when the matching BLIS_ENABLE_MIXED_* macro is defined.
INSERT_GENTPROT3U12_BASIC( dotxv_ref )

#if defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
INSERT_GENTPROT3U12_MIX_D( dotxv_ref )
#endif

#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
INSERT_GENTPROT3U12_MIX_P( dotxv_ref )
#endif

View File

@@ -34,15 +34,6 @@
#include "blis.h"
/*
void bli_invertv( obj_t* x )
{
if ( bli_error_checking_is_enabled() )
bli_invertv_check( x );
bli_invertv_unb_var1( x );
}
*/
//
// Define object-based interface.
@@ -60,7 +51,7 @@ void PASTEMAC0(opname)( \
PASTEMAC0(varname)( x ); \
}
GENFRONT( invertv, INVERTV_KERNEL )
GENFRONT( invertv, invertv_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_invertv_check.h"
#include "bli_invertv_unb_var1.h"
#include "bli_invertv_kernel.h"
#include "bli_invertv_ref.h"
//

View File

@@ -34,19 +34,48 @@
#include "blis.h"
#define FUNCPTR_T invertv_fp
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
typedef void (*FUNCPTR_T)(
dim_t n,
void* x, inc_t incx
);
INSERT_GENTPROT_BASIC( gemm_4x6 )
static FUNCPTR_T GENARRAY(ftypes,invertv_kernel_void);
// Object-based wrapper for the invertv kernel: pulls the datatype, vector
// dimension, increment and buffer address out of x, then calls the ftypes
// entry selected by that datatype.
void bli_invertv_kernel( obj_t* x )
{
	num_t x_dt  = bli_obj_datatype( *x );

	dim_t x_len = bli_obj_vector_dim( *x );

	inc_t x_inc = bli_obj_vector_inc( *x );
	void* x_buf = bli_obj_buffer_at_off( *x );

	// The datatype of x is the only dispatch key for this operation.
	FUNCPTR_T kernel_fp = ftypes[x_dt];

	kernel_fp( x_len, x_buf, x_inc );
}
// Template for the type-erased invertv kernel wrappers. Each instance takes
// x as void* and forwards (n, x, incx) unchanged to the function named by
// PASTEMAC(ch,kername). The ctype template parameter is not used here.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       void* x, inc_t incx \
     ) \
{ \
	PASTEMAC(ch,kername) \
	( \
	  n, \
	  x, incx \
	); \
}

INSERT_GENTFUNC_BASIC( invertv_kernel_void, INVERTV_KERNEL )

View File

@@ -32,9 +32,13 @@
*/
void bli_invertv_unb_var1( obj_t* x );
void bli_invertv_kernel( obj_t* x );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
@@ -43,5 +47,5 @@ void PASTEMAC(ch,varname)( \
void* x, inc_t incx \
);
INSERT_GENTPROT_BASIC( invertv_unb_var1 )
INSERT_GENTPROT_BASIC( invertv_kernel_void )

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T invertv_fp
typedef void (*FUNCPTR_T)(
@@ -41,10 +42,10 @@ typedef void (*FUNCPTR_T)(
void* x, inc_t incx
);
static FUNCPTR_T GENARRAY(ftypes,invertv_unb_var1);
static FUNCPTR_T GENARRAY(ftypes,invertv_ref);
void bli_invertv_unb_var1( obj_t* x )
void bli_invertv_ref( obj_t* x )
{
num_t dt_x = bli_obj_datatype( *x );
@@ -63,15 +64,17 @@ void bli_invertv_unb_var1( obj_t* x )
f( n,
buf_x, inc_x );
}
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t n, \
void* x, inc_t incx \
) \
void PASTEMAC(ch,varname) \
( \
dim_t n, \
ctype* restrict x, inc_t incx \
) \
{ \
ctype* x_cast = x; \
ctype* chi1; \
@@ -89,5 +92,5 @@ void PASTEMAC(ch,varname)( \
} \
}
INSERT_GENTFUNC_BASIC( invertv, invertv_unb_var1 )
INSERT_GENTFUNC_BASIC( invertv, invertv_ref )

View File

@@ -32,16 +32,19 @@
*/
/*
void bli_invertv_ref( obj_t* x );
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
void PASTEMAC(ch,varname) \
( \
dim_t n, \
ctype* restrict x, inc_t incx \
);
INSERT_GENTPROT_BASIC( trsm_l_ref_4x4 )
INSERT_GENTPROT_BASIC( invertv_ref )

View File

@@ -68,7 +68,7 @@ void PASTEMAC0(opname)( \
y ); \
}
GENFRONT( scal2v, SCAL2V_KERNEL )
GENFRONT( scal2v, scal2v_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_scal2v_check.h"
#include "bli_scal2v_unb_var1.h"
#include "bli_scal2v_kernel.h"
#include "bli_scal2v_ref.h"
//

View File

@@ -0,0 +1,129 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the scal2v_kernel_void wrappers: all operands
// are passed as untyped pointers.
#define FUNCPTR_T scal2v_fp

typedef void (*FUNCPTR_T)
     (
       conj_t conjx,
       dim_t  n,
       void*  beta,
       void*  x, inc_t incx,
       void*  y, inc_t incy
     );

// Dispatch table of scal2v_kernel_void wrappers. If some mixed datatype
// functions will not be compiled, the corresponding elements of the table
// are initialized to NULL.
#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_kernel_void);
#elif defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_kernel_void);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_kernel_void);
#endif
// Object-based wrapper for the scal2v kernel. Extracts datatypes, the
// conjugation status of x, the vector dimension of x, and the increments
// and buffers of x and y, resolves beta, and then calls the ftypes entry
// selected by the (beta, x, y) datatypes.
void bli_scal2v_kernel( obj_t* beta,
                        obj_t* x,
                        obj_t* y )
{
	num_t  x_dt   = bli_obj_datatype( *x );
	num_t  y_dt   = bli_obj_datatype( *y );

	conj_t x_conj = bli_obj_conj_status( *x );

	dim_t  len    = bli_obj_vector_dim( *x );

	inc_t  x_inc  = bli_obj_vector_inc( *x );
	void*  x_buf  = bli_obj_buffer_at_off( *x );

	inc_t  y_inc  = bli_obj_vector_inc( *y );
	void*  y_buf  = bli_obj_buffer_at_off( *y );

	num_t  beta_dt;
	void*  beta_buf;

	// When beta is a scalar constant, its value is looked up using the
	// datatype of x; otherwise beta's own datatype is used and its buffer
	// is taken at the beta offset.
	bli_set_scalar_dt_buffer( beta, x_dt, beta_dt, beta_buf );

	// Pick the wrapper for this combination of datatypes.
	FUNCPTR_T kernel_fp = ftypes[beta_dt][x_dt][y_dt];

	kernel_fp( x_conj,
	           len,
	           beta_buf,
	           x_buf, x_inc,
	           y_buf, y_inc );
}
// Template for the type-erased scal2v kernel wrappers. Each instance has
// the void*-based signature used by the ftypes table and forwards its
// arguments, unchanged and in the same order, to the function named by
// PASTEMAC3(chb,chx,chy,kername). The ctype_* template parameters are not
// used by this template.
#undef GENTFUNC3
#define GENTFUNC3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname, kername ) \
\
void PASTEMAC3(chb,chx,chy,varname) \
     ( \
       conj_t conjx, \
       dim_t  n, \
       void*  beta, \
       void*  x, inc_t incx, \
       void*  y, inc_t incy \
     ) \
{ \
	PASTEMAC3(chb,chx,chy,kername) \
	( \
	  conjx, \
	  n, \
	  beta, \
	  x, incx, \
	  y, incy \
	); \
}

// The basic instances are always generated; the mixed-domain and
// mixed-precision instances are generated only when the matching
// BLIS_ENABLE_MIXED_* macro is defined.
INSERT_GENTFUNC3_BASIC( scal2v_kernel_void, SCAL2V_KERNEL )

#if defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
INSERT_GENTFUNC3_MIX_D( scal2v_kernel_void, SCAL2V_KERNEL )
#endif

#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
INSERT_GENTFUNC3_MIX_P( scal2v_kernel_void, SCAL2V_KERNEL )
#endif

View File

@@ -32,11 +32,15 @@
*/
void bli_scal2v_unb_var1( obj_t* beta,
obj_t* x,
obj_t* y );
void bli_scal2v_kernel( obj_t* beta,
obj_t* x,
obj_t* y );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT3
#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname ) \
\
@@ -48,13 +52,13 @@ void PASTEMAC3(chb,chx,chy,varname)( \
void* y, inc_t incy \
);
INSERT_GENTPROT3_BASIC( scal2v_unb_var1 )
INSERT_GENTPROT3_BASIC( scal2v_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT3_MIX_D( scal2v_unb_var1 )
INSERT_GENTPROT3_MIX_D( scal2v_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT3_MIX_P( scal2v_unb_var1 )
INSERT_GENTPROT3_MIX_P( scal2v_kernel_void )
#endif

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T scal2v_fp
typedef void (*FUNCPTR_T)(
@@ -47,17 +48,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_unb_var1);
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_unb_var1);
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_ref);
#else
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_unb_var1);
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_ref);
#endif
#endif
void bli_scal2v_unb_var1( obj_t* beta,
void bli_scal2v_ref( obj_t* beta,
obj_t* x,
obj_t* y )
{
@@ -94,18 +95,20 @@ void bli_scal2v_unb_var1( obj_t* beta,
buf_x, inc_x,
buf_y, inc_y );
}
*/
#undef GENTFUNC3
#define GENTFUNC3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname, setvker ) \
\
void PASTEMAC3(chb,chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
void* beta, \
void* x, inc_t incx, \
void* y, inc_t incy \
) \
void PASTEMAC3(chb,chx,chy,varname) \
( \
conj_t conjx, \
dim_t n, \
ctype_b* restrict beta, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
) \
{ \
ctype_b* beta_cast = beta; \
ctype_x* x_cast = x; \
@@ -155,13 +158,13 @@ void PASTEMAC3(chb,chx,chy,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC3_BASIC( scal2v_unb_var1, SETV_KERNEL )
INSERT_GENTFUNC3_BASIC( scal2v_ref, SETV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3_MIX_D( scal2v_unb_var1, SETV_KERNEL )
INSERT_GENTFUNC3_MIX_D( scal2v_ref, SETV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3_MIX_P( scal2v_unb_var1, SETV_KERNEL )
INSERT_GENTFUNC3_MIX_P( scal2v_ref, SETV_KERNEL )
#endif

View File

@@ -0,0 +1,63 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
void bli_scal2v_ref( obj_t* beta,
obj_t* x,
obj_t* y );
*/
// Prototype template for the typed scal2v reference kernels. beta is a
// ctype_b, x a ctype_x and y a ctype_y; every pointer parameter is
// restrict-qualified.
#undef GENTPROT3
#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname ) \
\
void PASTEMAC3(chb,chx,chy,varname) \
     ( \
       conj_t            conjx, \
       dim_t             n, \
       ctype_b* restrict beta, \
       ctype_x* restrict x, inc_t incx, \
       ctype_y* restrict y, inc_t incy \
     );

// Basic prototypes are always emitted; the mixed-domain and mixed-precision
// prototypes only when the matching BLIS_ENABLE_MIXED_* macro is defined.
INSERT_GENTPROT3_BASIC( scal2v_ref )

#if defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
INSERT_GENTPROT3_MIX_D( scal2v_ref )
#endif

#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
INSERT_GENTPROT3_MIX_P( scal2v_ref )
#endif

View File

@@ -66,7 +66,7 @@ void PASTEMAC0(opname)( \
x ); \
}
GENFRONT( scalv, SCALV_KERNEL )
GENFRONT( scalv, scalv_kernel )
//

View File

@@ -36,7 +36,8 @@
#include "bli_scalv_check.h"
#include "bli_scalv_int.h"
#include "bli_scalv_unb_var1.h"
#include "bli_scalv_kernel.h"
#include "bli_scalv_ref.h"
//

View File

@@ -42,7 +42,7 @@ typedef void (*FUNCPTR_T)( obj_t* beta,
static FUNCPTR_T vars[1][3] =
{
// unblocked optimized unblocked blocked
{ bli_scalv_unb_var1, NULL, NULL }
{ bli_scalv_kernel, bli_scalv_kernel, NULL }
};
void bli_scalv_int( obj_t* beta,

View File

@@ -0,0 +1,120 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the scalv_kernel_void wrappers: beta and x are
// passed as untyped pointers.
#define FUNCPTR_T scalv_fp

typedef void (*FUNCPTR_T)
     (
       conj_t conjbeta,
       dim_t  n,
       void*  beta,
       void*  x, inc_t incx
     );

// Dispatch table of scalv_kernel_void wrappers. If some mixed datatype
// functions will not be compiled, the corresponding elements of the table
// are initialized to NULL.
#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_kernel_void);
#elif defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_kernel_void);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_kernel_void);
#endif
// Object-based wrapper for the scalv kernel. Extracts the datatype, vector
// dimension, increment and buffer of x and the conjugation status of beta,
// resolves beta, and then calls the ftypes entry selected by the (beta, x)
// datatypes.
void bli_scalv_kernel( obj_t* beta,
                       obj_t* x )
{
	num_t  x_dt      = bli_obj_datatype( *x );

	conj_t beta_conj = bli_obj_conj_status( *beta );

	dim_t  len       = bli_obj_vector_dim( *x );

	inc_t  x_inc     = bli_obj_vector_inc( *x );
	void*  x_buf     = bli_obj_buffer_at_off( *x );

	num_t  beta_dt;
	void*  beta_buf;

	// When beta is a scalar constant, its value is looked up using the
	// datatype of x; otherwise beta's own datatype is used and its buffer
	// is taken at the beta offset.
	bli_set_scalar_dt_buffer( beta, x_dt, beta_dt, beta_buf );

	// Pick the wrapper for this combination of datatypes.
	FUNCPTR_T kernel_fp = ftypes[beta_dt][x_dt];

	kernel_fp( beta_conj,
	           len,
	           beta_buf,
	           x_buf, x_inc );
}
// Template for the type-erased scalv kernel wrappers. Each instance has the
// void*-based signature used by the ftypes table and forwards its
// arguments, unchanged and in the same order, to the function named by
// PASTEMAC2(chb,chx,kername). The ctype_* template parameters are not used
// by this template.
#undef GENTFUNC2
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, kername ) \
\
void PASTEMAC2(chb,chx,varname) \
     ( \
       conj_t conjbeta, \
       dim_t  n, \
       void*  beta, \
       void*  x, inc_t incx \
     ) \
{ \
	PASTEMAC2(chb,chx,kername) \
	( \
	  conjbeta, \
	  n, \
	  beta, \
	  x, incx \
	); \
}

// The basic instances are always generated; the mixed-domain and
// mixed-precision instances are generated only when the matching
// BLIS_ENABLE_MIXED_* macro is defined.
INSERT_GENTFUNC2_BASIC( scalv_kernel_void, SCALV_KERNEL )

#if defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
INSERT_GENTFUNC2_MIX_D( scalv_kernel_void, SCALV_KERNEL )
#endif

#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
INSERT_GENTFUNC2_MIX_P( scalv_kernel_void, SCALV_KERNEL )
#endif

View File

@@ -32,10 +32,14 @@
*/
void bli_scalv_unb_var1( obj_t* beta,
obj_t* x );
void bli_scalv_kernel( obj_t* beta,
obj_t* x );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT2
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
\
@@ -46,13 +50,13 @@ void PASTEMAC2(chb,chx,varname)( \
void* x, inc_t incx \
);
INSERT_GENTPROT2_BASIC( scalv_unb_var1 )
INSERT_GENTPROT2_BASIC( scalv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT2_MIX_D( scalv_unb_var1 )
INSERT_GENTPROT2_MIX_D( scalv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT2_MIX_P( scalv_unb_var1 )
INSERT_GENTPROT2_MIX_P( scalv_kernel_void )
#endif

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T scalv_fp
typedef void (*FUNCPTR_T)(
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_unb_var1);
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_unb_var1);
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_ref);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_unb_var1);
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_ref);
#endif
#endif
void bli_scalv_unb_var1( obj_t* beta,
void bli_scalv_ref( obj_t* beta,
obj_t* x )
{
num_t dt_x = bli_obj_datatype( *x );
@@ -88,17 +89,19 @@ void bli_scalv_unb_var1( obj_t* beta,
buf_beta,
buf_x, inc_x );
}
*/
#undef GENTFUNC2
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, setvker ) \
\
void PASTEMAC2(chb,chx,varname)( \
conj_t conjbeta, \
dim_t n, \
void* beta, \
void* x, inc_t incx \
) \
void PASTEMAC2(chb,chx,varname) \
( \
conj_t conjbeta, \
dim_t n, \
ctype_b* restrict beta, \
ctype_x* restrict x, inc_t incx \
) \
{ \
ctype_b* beta_cast = beta; \
ctype_x* x_cast = x; \
@@ -136,13 +139,13 @@ void PASTEMAC2(chb,chx,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC2_BASIC( scalv_unb_var1, SETV_KERNEL )
INSERT_GENTFUNC2_BASIC( scalv_ref, SETV_KERNEL )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC2_MIX_D( scalv_unb_var1, SETV_KERNEL )
INSERT_GENTFUNC2_MIX_D( scalv_ref, SETV_KERNEL )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC2_MIX_P( scalv_unb_var1, SETV_KERNEL )
INSERT_GENTFUNC2_MIX_P( scalv_ref, SETV_KERNEL )
#endif

View File

@@ -0,0 +1,61 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
void bli_scalv_ref( obj_t* beta,
obj_t* x );
*/
// Prototype template for the typed scalv reference kernels. beta is a
// ctype_b and x a ctype_x; both pointer parameters are restrict-qualified.
#undef GENTPROT2
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
\
void PASTEMAC2(chb,chx,varname) \
     ( \
       conj_t            conjbeta, \
       dim_t             n, \
       ctype_b* restrict beta, \
       ctype_x* restrict x, inc_t incx \
     );

// Basic prototypes are always emitted; the mixed-domain and mixed-precision
// prototypes only when the matching BLIS_ENABLE_MIXED_* macro is defined.
INSERT_GENTPROT2_BASIC( scalv_ref )

#if defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
INSERT_GENTPROT2_MIX_D( scalv_ref )
#endif

#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
INSERT_GENTPROT2_MIX_P( scalv_ref )
#endif

View File

@@ -67,7 +67,7 @@ void PASTEMAC0(opname)( \
x ); \
}
GENFRONT( setv, SETV_KERNEL )
GENFRONT( setv, setv_kernel )
//

View File

@@ -34,8 +34,8 @@
#include "bli_setv_check.h"
#include "bli_setv_unb_var1.h"
#include "bli_setv_unb_var2.h"
#include "bli_setv_kernel.h"
#include "bli_setv_ref.h"
//

View File

@@ -0,0 +1,113 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the setv_kernel_void wrappers: beta and x are
// passed as untyped pointers.
#define FUNCPTR_T setv_fp

typedef void (*FUNCPTR_T)
     (
       dim_t n,
       void* beta,
       void* x, inc_t incx
     );

// Dispatch table of setv_kernel_void wrappers. If some mixed datatype
// functions will not be compiled, the corresponding elements of the table
// are initialized to NULL.
#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_kernel_void);
#elif defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_kernel_void);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_kernel_void);
#endif
// Object-based wrapper for the setv kernel. Extracts the datatype, vector
// dimension, buffer and increment of x, resolves beta, and then calls the
// ftypes entry selected by the (beta, x) datatypes.
void bli_setv_kernel( obj_t* beta,
                      obj_t* x )
{
	num_t x_dt  = bli_obj_datatype( *x );

	dim_t len   = bli_obj_vector_dim( *x );

	void* x_buf = bli_obj_buffer_at_off( *x );
	inc_t x_inc = bli_obj_vector_inc( *x );

	void* beta_buf;
	num_t beta_dt;

	// When beta is a scalar constant, its value is looked up using the
	// datatype of x; otherwise beta's own datatype is used and its buffer
	// is taken at the beta offset.
	bli_set_scalar_dt_buffer( beta, x_dt, beta_dt, beta_buf );

	// Pick the wrapper for this combination of datatypes.
	FUNCPTR_T kernel_fp = ftypes[beta_dt][x_dt];

	kernel_fp( len,
	           beta_buf,
	           x_buf, x_inc );
}
// Template for the type-erased setv kernel wrappers. Each instance has the
// void*-based signature used by the ftypes table and forwards its
// arguments, unchanged and in the same order, to the function named by
// PASTEMAC2(chb,chx,kername). The ctype_* template parameters are not used
// by this template.
#undef GENTFUNC2
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, kername ) \
\
void PASTEMAC2(chb,chx,varname) \
     ( \
       dim_t n, \
       void* beta, \
       void* x, inc_t incx \
     ) \
{ \
	PASTEMAC2(chb,chx,kername) \
	( \
	  n, \
	  beta, \
	  x, incx \
	); \
}

// The basic instances are always generated; the mixed-domain and
// mixed-precision instances are generated only when the matching
// BLIS_ENABLE_MIXED_* macro is defined.
INSERT_GENTFUNC2_BASIC( setv_kernel_void, SETV_KERNEL )

#if defined(BLIS_ENABLE_MIXED_DOMAIN_SUPPORT)
INSERT_GENTFUNC2_MIX_D( setv_kernel_void, SETV_KERNEL )
#endif

#if defined(BLIS_ENABLE_MIXED_PRECISION_SUPPORT)
INSERT_GENTFUNC2_MIX_P( setv_kernel_void, SETV_KERNEL )
#endif

View File

@@ -32,10 +32,14 @@
*/
void bli_setv_unb_var1( obj_t* beta,
obj_t* x );
void bli_setv_kernel( obj_t* beta,
obj_t* x );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT2
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
\
@@ -45,13 +49,13 @@ void PASTEMAC2(chb,chx,varname)( \
void* x, inc_t incx \
);
INSERT_GENTPROT2_BASIC( setv_unb_var1 )
INSERT_GENTPROT2_BASIC( setv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT2_MIX_D( setv_unb_var1 )
INSERT_GENTPROT2_MIX_D( setv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT2_MIX_P( setv_unb_var1 )
INSERT_GENTPROT2_MIX_P( setv_kernel_void )
#endif

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T setv_fp
typedef void (*FUNCPTR_T)(
@@ -45,17 +46,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_unb_var1);
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_unb_var1);
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_ref);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_unb_var1);
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_ref);
#endif
#endif
void bli_setv_unb_var1( obj_t* beta,
void bli_setv_ref( obj_t* beta,
obj_t* x )
{
num_t dt_x = bli_obj_datatype( *x );
@@ -84,16 +85,17 @@ void bli_setv_unb_var1( obj_t* beta,
buf_beta,
buf_x, inc_x );
}
*/
#undef GENTFUNC2
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, opname, varname ) \
\
void PASTEMAC2(chb,chx,varname)( \
dim_t n, \
void* beta, \
void* x, inc_t incx \
) \
void PASTEMAC2(chb,chx,varname) \
( \
dim_t n, \
ctype_b* restrict beta, \
ctype_x* restrict x, inc_t incx \
) \
{ \
ctype_b* beta_cast = beta; \
ctype_x* chi1 = x; \
@@ -123,12 +125,12 @@ void PASTEMAC2(chb,chx,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC2_BASIC( setv, setv_unb_var1 )
INSERT_GENTFUNC2_BASIC( setv, setv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC2_MIX_D( setv, setv_unb_var1 )
INSERT_GENTFUNC2_MIX_D( setv, setv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC2_MIX_P( setv, setv_unb_var1 )
INSERT_GENTFUNC2_MIX_P( setv, setv_ref )
#endif

View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
void bli_setv_ref( obj_t* beta,
obj_t* x );
*/
// Prototype template for the typed setv_ref functions. Each expansion
// declares one function named by PASTEMAC2(chb,chx,varname) that takes
// n, a ctype_b pointer beta, and a ctype_x pointer x with increment incx.
#undef  GENTPROT2
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
\
void PASTEMAC2(chb,chx,varname)( \
    dim_t             n, \
    ctype_b* restrict beta, \
    ctype_x* restrict x, inc_t incx \
);
// Emit the setv_ref prototypes: the basic set always, and the mixed-domain
// and mixed-precision sets only when the matching BLIS_ENABLE_* macro is
// defined.
INSERT_GENTPROT2_BASIC( setv_ref )

#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
INSERT_GENTPROT2_MIX_D( setv_ref )
#endif

#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
INSERT_GENTPROT2_MIX_P( setv_ref )
#endif

View File

@@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \
y ); \
}
GENFRONT( subv, SUBV_KERNEL )
GENFRONT( subv, subv_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_subv_check.h"
#include "bli_subv_unb_var1.h"
#include "bli_subv_kernel.h"
#include "bli_subv_ref.h"
//

View File

@@ -0,0 +1,115 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by the void-pointer subv kernel wrappers that populate
// the ftypes table below.
#define FUNCPTR_T subv_fp

typedef void (*FUNCPTR_T)( conj_t conjx,
                           dim_t  n,
                           void*  x, inc_t incx,
                           void*  y, inc_t incy );

// Build the ftypes table of subv_kernel_void wrappers. Which GENARRAY2_*
// initializer is used depends on the mixed datatype support that is enabled;
// elements for mixed datatype functions that will not be compiled are
// initialized to NULL.
#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_kernel_void);
#elif defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_kernel_void);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_kernel_void);
#endif
// Object-based entry point for the subv kernel.
//
// Reads the datatypes, the conjugation status of x, the vector dimension
// of x, and the increment and buffer address of both x and y from the
// objects, then calls the wrapper stored in ftypes for that datatype pair.
void bli_subv_kernel( obj_t* x,
                      obj_t* y )
{
    // The datatype pair selects the table entry.
    num_t  x_dt   = bli_obj_datatype( *x );
    num_t  y_dt   = bli_obj_datatype( *y );

    // Conjugation status and length are both taken from x.
    conj_t x_conj = bli_obj_conj_status( *x );
    dim_t  len    = bli_obj_vector_dim( *x );

    // Increment and buffer address of each operand.
    inc_t  x_inc  = bli_obj_vector_inc( *x );
    void*  x_buf  = bli_obj_buffer_at_off( *x );
    inc_t  y_inc  = bli_obj_vector_inc( *y );
    void*  y_buf  = bli_obj_buffer_at_off( *y );

    // Look up the wrapper for this type combination and call it.
    ftypes[x_dt][y_dt]( x_conj, len, x_buf, x_inc, y_buf, y_inc );
}
// Template for the void-pointer subv wrappers. Each expansion defines a
// function named by PASTEMAC2(chx,chy,varname) that forwards its arguments,
// unchanged and in the same order, to PASTEMAC2(chx,chy,kername).
#undef  GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
\
void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t conjx, \
       dim_t  n, \
       void*  x, inc_t incx, \
       void*  y, inc_t incy \
     ) \
{ \
    PASTEMAC2(chx,chy,kername)( conjx, n, x, incx, y, incy ); \
}
// Instantiate the subv_kernel_void wrappers around SUBV_KERNEL: the basic
// set always, and the mixed-domain and mixed-precision sets only when the
// matching BLIS_ENABLE_* macro is defined.
INSERT_GENTFUNC2_BASIC( subv_kernel_void, SUBV_KERNEL )

#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
INSERT_GENTFUNC2_MIX_D( subv_kernel_void, SUBV_KERNEL )
#endif

#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
INSERT_GENTFUNC2_MIX_P( subv_kernel_void, SUBV_KERNEL )
#endif

View File

@@ -32,10 +32,14 @@
*/
void bli_addv_unb_var1( obj_t* x,
obj_t* y );
void bli_subv_kernel( obj_t* x,
obj_t* y );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
@@ -46,12 +50,12 @@ void PASTEMAC2(chx,chy,varname)( \
void* y, inc_t incy \
);
INSERT_GENTPROT2_BASIC( addv_unb_var1 )
INSERT_GENTPROT2_BASIC( subv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT2_MIX_D( addv_unb_var1 )
INSERT_GENTPROT2_MIX_D( subv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT2_MIX_P( addv_unb_var1 )
INSERT_GENTPROT2_MIX_P( subv_kernel_void )
#endif

View File

@@ -34,6 +34,7 @@
#include "blis.h"
/*
#define FUNCPTR_T subv_fp
typedef void (*FUNCPTR_T)(
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
// If some mixed datatype functions will not be compiled, we initialize
// the corresponding elements of the function array to NULL.
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_unb_var1);
static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_ref);
#else
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_unb_var1);
static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_ref);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_unb_var1);
static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_ref);
#endif
#endif
void bli_subv_unb_var1( obj_t* x,
void bli_subv_ref( obj_t* x,
obj_t* y )
{
num_t dt_x = bli_obj_datatype( *x );
@@ -83,17 +84,19 @@ void bli_subv_unb_var1( obj_t* x,
buf_x, inc_x,
buf_y, inc_y );
}
*/
#undef GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
\
void PASTEMAC2(chx,chy,varname)( \
conj_t conjx, \
dim_t n, \
void* x, inc_t incx, \
void* y, inc_t incy \
) \
void PASTEMAC2(chx,chy,varname) \
( \
conj_t conjx, \
dim_t n, \
ctype_x* restrict x, inc_t incx, \
ctype_y* restrict y, inc_t incy \
) \
{ \
ctype_x* x_cast = x; \
ctype_y* y_cast = y; \
@@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \
// Define the basic set of functions unconditionally, and then also some
// mixed datatype functions if requested.
INSERT_GENTFUNC2_BASIC( subv, subv_unb_var1 )
INSERT_GENTFUNC2_BASIC( subv, subv_ref )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC2_MIX_D( subv, subv_unb_var1 )
INSERT_GENTFUNC2_MIX_D( subv, subv_ref )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC2_MIX_P( subv, subv_unb_var1 )
INSERT_GENTFUNC2_MIX_P( subv, subv_ref )
#endif

View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
void bli_subv_ref( obj_t* x,
obj_t* y );
*/
// Prototype template for the typed subv_ref functions. Each expansion
// declares one function named by PASTEMAC2(chx,chy,varname) that takes
// conjx, n, a ctype_x pointer x with increment incx, and a ctype_y pointer
// y with increment incy.
#undef  GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
void PASTEMAC2(chx,chy,varname)( \
    conj_t            conjx, \
    dim_t             n, \
    ctype_x* restrict x, inc_t incx, \
    ctype_y* restrict y, inc_t incy \
);
// Emit the subv_ref prototypes: the basic set always, and the mixed-domain
// and mixed-precision sets only when the matching BLIS_ENABLE_* macro is
// defined.
INSERT_GENTPROT2_BASIC( subv_ref )

#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
INSERT_GENTPROT2_MIX_D( subv_ref )
#endif

#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
INSERT_GENTPROT2_MIX_P( subv_ref )
#endif

View File

@@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \
y ); \
}
GENFRONT( swapv, SWAPV_KERNEL )
GENFRONT( swapv, swapv_kernel )
//

View File

@@ -33,7 +33,8 @@
*/
#include "bli_swapv_check.h"
#include "bli_swapv_unb_var1.h"
#include "bli_swapv_kernel.h"
#include "bli_swapv_ref.h"
//

View File

@@ -0,0 +1,110 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by the void-pointer swapv kernel wrappers that populate
// the ftypes table below.
#define FUNCPTR_T swapv_fp

typedef void (*FUNCPTR_T)( dim_t n,
                           void* x, inc_t incx,
                           void* y, inc_t incy );

// Build the ftypes table of swapv_kernel_void wrappers. Which GENARRAY2_*
// initializer is used depends on the mixed datatype support that is enabled;
// elements for mixed datatype functions that will not be compiled are
// initialized to NULL.
#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
static FUNCPTR_T GENARRAY2_ALL(ftypes,swapv_kernel_void);
#elif defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
static FUNCPTR_T GENARRAY2_EXT(ftypes,swapv_kernel_void);
#else
static FUNCPTR_T GENARRAY2_MIN(ftypes,swapv_kernel_void);
#endif
// Object-based entry point for the swapv kernel.
//
// Reads the datatypes, the vector dimension of x, and the increment and
// buffer address of both x and y from the objects, then calls the wrapper
// stored in ftypes for that datatype pair.
void bli_swapv_kernel( obj_t* x,
                       obj_t* y )
{
    // The datatype pair selects the table entry.
    num_t x_dt  = bli_obj_datatype( *x );
    num_t y_dt  = bli_obj_datatype( *y );

    // The length is taken from x.
    dim_t len   = bli_obj_vector_dim( *x );

    // Increment and buffer address of each operand.
    inc_t x_inc = bli_obj_vector_inc( *x );
    void* x_buf = bli_obj_buffer_at_off( *x );
    inc_t y_inc = bli_obj_vector_inc( *y );
    void* y_buf = bli_obj_buffer_at_off( *y );

    // Look up the wrapper for this type combination and call it.
    ftypes[x_dt][y_dt]( len, x_buf, x_inc, y_buf, y_inc );
}
// Template for the void-pointer swapv wrappers. Each expansion defines a
// function named by PASTEMAC2(chx,chy,varname) that forwards its arguments,
// unchanged and in the same order, to PASTEMAC2(chx,chy,kername).
#undef  GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
\
void PASTEMAC2(chx,chy,varname) \
     ( \
       dim_t n, \
       void* x, inc_t incx, \
       void* y, inc_t incy \
     ) \
{ \
    PASTEMAC2(chx,chy,kername)( n, x, incx, y, incy ); \
}
// Instantiate the swapv_kernel_void wrappers around SWAPV_KERNEL: the basic
// set always, and the mixed-domain and mixed-precision sets only when the
// matching BLIS_ENABLE_* macro is defined.
INSERT_GENTFUNC2_BASIC( swapv_kernel_void, SWAPV_KERNEL )

#if defined( BLIS_ENABLE_MIXED_DOMAIN_SUPPORT )
INSERT_GENTFUNC2_MIX_D( swapv_kernel_void, SWAPV_KERNEL )
#endif

#if defined( BLIS_ENABLE_MIXED_PRECISION_SUPPORT )
INSERT_GENTFUNC2_MIX_P( swapv_kernel_void, SWAPV_KERNEL )
#endif

View File

@@ -32,10 +32,14 @@
*/
void bli_swapv_unb_var1( obj_t* x,
obj_t* y );
void bli_swapv_kernel( obj_t* x,
obj_t* y );
//
// Prototype the void pointer kernel wrappers.
//
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
@@ -45,12 +49,12 @@ void PASTEMAC2(chx,chy,varname)( \
void* y, inc_t incy \
);
INSERT_GENTPROT2_BASIC( swapv_unb_var1 )
INSERT_GENTPROT2_BASIC( swapv_kernel_void )
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTPROT2_MIX_D( swapv_unb_var1 )
INSERT_GENTPROT2_MIX_D( swapv_kernel_void )
#endif
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTPROT2_MIX_P( swapv_unb_var1 )
INSERT_GENTPROT2_MIX_P( swapv_kernel_void )
#endif

Some files were not shown because too many files have changed in this diff Show More