Merge pull request #3 from figual/master

New ARM armv7a kernels and Assembly file consideration in Makefile
This commit is contained in:
Field G. Van Zee
2014-02-21 09:04:13 -06:00
10 changed files with 2819 additions and 7 deletions

View File

@@ -317,18 +317,24 @@ CFLAGS_KERNELS := $(CFLAGS_KERNELS) $(VERS_DEF)
# Convert source file paths to object file paths by replacing the base source
# directories with the base object directories, and also replacing the source
# file suffix (eg: '.c') with '.o'.
MK_BLIS_CONFIG_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
MK_BLIS_FRAME_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
$(filter %.c, $(MK_FRAME_SRC)))
MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
MK_BLIS_FRAME_NOOPT_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
$(filter %.c, $(MK_FRAME_NOOPT_SRC)))
MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
$(filter %.c, $(MK_FRAME_KERNELS_SRC)))
MK_BLIS_FRAME_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
MK_BLIS_CONFIG_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.S, $(MK_CONFIG_SRC)))
MK_BLIS_CONFIG_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.c, $(MK_CONFIG_SRC)))
MK_BLIS_FRAME_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.S, $(MK_CONFIG_NOOPT_SRC)))
MK_BLIS_CONFIG_NOOPT_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.c, $(MK_CONFIG_NOOPT_SRC)))
MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.S, $(MK_CONFIG_KERNELS_SRC)))
MK_BLIS_CONFIG_KERNELS_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
$(filter %.c, $(MK_CONFIG_KERNELS_SRC)))
# Combine all of the object files into some readily-accessible variables.
@@ -427,7 +433,7 @@ else
@$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@
endif
$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.c $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH)
$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.[cS] $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH)
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@
else

169
config/armv7a/bli_config.h Normal file
View File

@@ -0,0 +1,169 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// -- OPERATING SYSTEM ---------------------------------------------------------
// -- INTEGER PROPERTIES -------------------------------------------------------
// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#define BLIS_INT_TYPE_SIZE 32
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
//#define BLIS_ENABLE_C99_COMPLEX
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
#define BLIS_MAX_NUM_THREADS 1
// -- MEMORY ALLOCATION --------------------------------------------------------
// -- Contiguous (static) memory allocator --
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_MC_X_NC_BLOCKS 0
// The maximum preload byte offset is used to pad the end of the contiguous
// memory pools so that the micro-kernel, when computing with the end of the
// last block, can exceed the bounds of the usable portion of the memory
// region without causing a segmentation fault.
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
// -- Memory alignment --
// It is sometimes useful to define the various memory alignments in terms
// of some other characteristics of the system, such as the cache line size
// and the page size.
#define BLIS_CACHE_LINE_SIZE 32
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 32
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
// Alignment size used when allocating entire blocks of contiguous memory
// from the contiguous memory allocator.
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
// Alignment size used when sizing strides (eg: of packed micro-panels)
// within a block of contiguous memory.
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
// Basic (homogeneous) datatype support always enabled.
// Enable mixed domain operations?
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
// Enable extra mixed precision operations?
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
// Stay initialized after auto-initialization, unless and until the user
// explicitly calls bli_finalize().
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
// Enable the BLAS compatibility layer?
#define BLIS_ENABLE_BLAS2BLIS
// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
#endif

348
config/armv7a/bli_kernel.h Normal file
View File

@@ -0,0 +1,348 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
// (3) KC must be a multiple of
// (a) MR and
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 432
#define BLIS_DEFAULT_KC_S 352
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_D 192
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MR_C 2
#define BLIS_DEFAULT_NR_C 2
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 2
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
// -- Register blocksize extensions (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
// used by certain blocked variants. But when the *are* used, they MUST be
// be an integer multiple of NR!
#define BLIS_DEFAULT_NI_FAC 16
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- gemm --
#include "bli_gemm_opt_4x4.h"
#define GEMM_UKERNEL gemm_opt_4x4
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

View File

@@ -0,0 +1,502 @@
#define REALNAME bli_cgemm_kernel_2x2
#define STACKSIZE 256
#define K r0
#define PTR_ALPHA r1
#define OLD_A r2
#define OLD_B r3
#define PTR_BETA [fp, #0 ]
#define OLD_C [fp, #4 ]
#define OLD_RSC [fp, #8 ]
#define OLD_CSC [fp, #12 ]
#define AUX [fp, #16 ]
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* register
*******************************************************/
#define L r2
#define AO r5
#define BO r6
#define CO1 r7
#define CO2 r8
#define A_PRE 96
#define B_PRE 96
#define C_PRE 0
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#define FMAC_BR fnmacs
#define FMAC_BI fmacs
#define NN 1
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fnmacs
#elif defined(CN) || defined(CT)
#define FADD_R fadds
#define FADD_I fsubs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define FADD_R fadds
#define FADD_I fsubs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#else
#define FADD_R fsubs
#define FADD_I fadds
#define FMAC_R1 fnmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fnmacs
#endif
.macro INIT2x2
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
vmov.f32 s20, s16
vmov.f32 s21, s16
vmov.f32 s22, s16
vmov.f32 s23, s16
vmov.f32 s24, s16
vmov.f32 s25, s16
vmov.f32 s26, s16
vmov.f32 s27, s16
vmov.f32 s28, s16
vmov.f32 s29, s16
vmov.f32 s30, s16
vmov.f32 s31, s16
.endm
.macro KERNEL2x2_I
pld [ AO , #A_PRE ]
pld [ BO , #B_PRE ]
flds s0 , [ AO ]
flds s1 , [ AO, #4 ]
flds s8 , [ BO ]
flds s9 , [ BO, #4 ]
fmuls s16 , s0, s8
flds s2 , [ AO, #8 ]
fmuls s24 , s1, s9
flds s3 , [ AO, #12 ]
fmuls s17 , s0, s9
flds s10, [ BO, #8 ]
fmuls s25 , s1, s8
flds s11, [ BO, #12 ]
fmuls s18 , s2, s8
add BO , BO, #16
fmuls s26 , s3, s9
add AO , AO, #16
fmuls s19 , s2, s9
pld [ BO , #B_PRE ]
fmuls s27 , s3, s8
pld [ AO , #A_PRE ]
fmuls s20 , s0, s10
flds s4 , [ AO, #0 ]
fmuls s28 , s1, s11
flds s5 , [ AO, #4 ]
fmuls s21 , s0, s11
flds s12, [ BO ]
fmuls s29 , s1, s10
flds s13, [ BO, #4 ]
fmuls s22 , s2, s10
flds s6 , [ AO, #8 ]
fmuls s30 , s3, s11
flds s7 , [ AO, #12 ]
fmuls s23 , s2, s11
flds s14, [ BO, #8 ]
fmuls s31 , s3, s10
flds s15, [ BO, #12 ]
add BO , BO, #16
add AO , AO, #16
.endm
.macro KERNEL2x2_M1
pld [ AO , #A_PRE ]
fmacs s16 , s0, s8
pld [ BO , #B_PRE ]
fmacs s24 , s1, s9
flds s4 , [ AO, #0 ]
fmacs s17 , s0, s9
flds s5 , [ AO, #4 ]
fmacs s25 , s1, s8
flds s12, [ BO ]
fmacs s18 , s2, s8
flds s13, [ BO, #4 ]
fmacs s26 , s3, s9
flds s6 , [ AO, #8 ]
fmacs s19 , s2, s9
flds s7 , [ AO, #12 ]
fmacs s27 , s3, s8
fmacs s20 , s0, s10
flds s14, [ BO, #8 ]
fmacs s28 , s1, s11
fmacs s21 , s0, s11
flds s15, [ BO, #12 ]
fmacs s29 , s1, s10
fmacs s22 , s2, s10
add BO , BO, #16
fmacs s30 , s3, s11
fmacs s23 , s2, s11
add AO , AO, #16
fmacs s31 , s3, s10
.endm
.macro KERNEL2x2_M2
fmacs s16 , s4, s12
fmacs s24 , s5, s13
flds s0 , [ AO, #0 ]
fmacs s17 , s4, s13
flds s1 , [ AO, #4 ]
fmacs s25 , s5, s12
fmacs s18 , s6, s12
flds s8 , [ BO ]
fmacs s26 , s7, s13
flds s9 , [ BO, #4 ]
fmacs s19 , s6, s13
fmacs s27 , s7, s12
flds s2 , [ AO, #8 ]
fmacs s20 , s4, s14
flds s3 , [ AO, #12 ]
fmacs s28 , s5, s15
fmacs s21 , s4, s15
flds s10, [ BO, #8 ]
fmacs s29 , s5, s14
flds s11, [ BO, #12 ]
fmacs s22 , s6, s14
fmacs s30 , s7, s15
add BO , BO, #16
fmacs s23 , s6, s15
add AO , AO, #16
fmacs s31 , s7, s14
.endm
.macro KERNEL2x2_E
fmacs s16 , s4, s12
fmacs s24 , s5, s13
fmacs s17 , s4, s13
fmacs s25 , s5, s12
fmacs s18 , s6, s12
fmacs s26 , s7, s13
fmacs s19 , s6, s13
fmacs s27 , s7, s12
fmacs s20 , s4, s14
fmacs s28 , s5, s15
fmacs s21 , s4, s15
fmacs s29 , s5, s14
fmacs s22 , s6, s14
fmacs s30 , s7, s15
fmacs s23 , s6, s15
fmacs s31 , s7, s14
.endm
.macro KERNEL2x2_SUB
flds s0 , [ AO ]
flds s1 , [ AO, #4 ]
flds s8 , [ BO ]
flds s9 , [ BO, #4 ]
fmacs s16 , s0, s8
flds s2 , [ AO, #8 ]
fmacs s24 , s1, s9
flds s3 , [ AO, #12 ]
fmacs s17 , s0, s9
flds s10, [ BO, #8 ]
fmacs s25 , s1, s8
flds s11, [ BO, #12 ]
fmacs s18 , s2, s8
fmacs s26 , s3, s9
fmacs s19 , s2, s9
fmacs s27 , s3, s8
fmacs s20 , s0, s10
fmacs s28 , s1, s11
fmacs s21 , s0, s11
fmacs s29 , s1, s10
fmacs s22 , s2, s10
add BO , BO, #16
fmacs s30 , s3, s11
fmacs s23 , s2, s11
add AO , AO, #16
fmacs s31 , s3, s10
.endm
.macro SAVE2x2
ldr r3, OLD_RSC // Row stride size
lsl r3, r3, #3 // multiply with size of complex float
flds s0, [ PTR_ALPHA ] // load real part of alpha
flds s1, [ PTR_ALPHA, #4 ] // load imag part of alpha
ldr r4, PTR_BETA
flds s2, [ r4 ] // load real part of beta
flds s3, [ r4, #4 ] // load imag part of beta
// Add/Sub the real and the imag parts
FADD_R s16, s24 , s16
FADD_I s17, s25 , s17
FADD_R s18, s26 , s18
FADD_I s19, s27 , s19
FADD_R s20, s28 , s20
FADD_I s21, s29 , s21
FADD_R s22, s30 , s22
FADD_I s23, s31 , s23
mov r4, CO1 // save pointer
fldmias CO1, { s4 - s5 } // read real and imag part from C
add CO1, CO1, r3
mov r2, CO2 // save pointer
fldmias CO2, { s8 - s9 } // read real and imag part from C
add CO2, CO2, r3
fmuls s24, s4, s2 // multiply Beta-real with C-real
fmuls s25, s5, s2 // multiply Beta-real with C-imag
fmuls s28, s8, s2 // multiply Beta-real with C-real
fmuls s29, s9, s2 // multiply Beta-real with C-imag
FMAC_BR s24, s3, s5 // multiply beta-imag with C-imag and add
FMAC_BI s25, s3, s4 // multiply beta-imag with C-real and add
FMAC_BR s28, s3, s9 // multiply beta-imag with C-imag and add
FMAC_BI s29, s3, s8 // multiply beta-imag with C-real and add
FMAC_R1 s24 , s0 , s16
FMAC_I1 s25 , s0 , s17
FMAC_R2 s24 , s1 , s17
FMAC_I2 s25 , s1 , s16
FMAC_R1 s28 , s0 , s20
FMAC_I1 s29 , s0 , s21
FMAC_R2 s28 , s1 , s21
FMAC_I2 s29 , s1 , s20
fldmias CO1, { s4 - s5 } // read real and imag part from C
fldmias CO2, { s8 - s9 } // read real and imag part from C
fmuls s26, s4, s2 // multiply Beta-real with C-real
fmuls s27, s5, s2 // multiply Beta-real with C-imag
fmuls s30, s8, s2 // multiply Beta-real with C-real
fmuls s31, s9, s2 // multiply Beta-real with C-imag
FMAC_BR s26, s3, s5 // multiply beta-imag with C-imag and add
FMAC_BI s27, s3, s4 // multiply beta-imag with C-real and add
FMAC_BR s30, s3, s9 // multiply beta-imag with C-imag and add
FMAC_BI s31, s3, s8 // multiply beta-imag with C-real and add
FMAC_R1 s26 , s0 , s18
FMAC_I1 s27 , s0 , s19
FMAC_R2 s26 , s1 , s19
FMAC_I2 s27 , s1 , s18
FMAC_R1 s30, s0 , s22
FMAC_I1 s31, s0 , s23
FMAC_R2 s30, s1 , s23
FMAC_I2 s31, s1 , s22
mov CO1, r4 // restore pointer
mov CO2, r2 // restore pointer
fstmias CO1, { s24 - s25 }
fstmias CO2, { s28 - s29 }
add CO1, CO1, r3
add CO2, CO2, r3
fstmias CO1, { s26 - s27 }
fstmias CO2, { s30 - s31 }
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
.arm
.global REALNAME
.func REALNAME
REALNAME:
push {r4 - r9, fp} // save register
add fp, sp, #28 // add number of saved register multiplied by size of int
sub sp, sp, #STACKSIZE // reserve stack
mov AO, OLD_A // pointer matrix A
mov BO, OLD_B // pointer matrix B
sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers
ldr r2, OLD_C // pointer matrix C
ldr r3, OLD_CSC // Col stride size of C
lsl r3, r3, #3 // multiply with size of complex float
mov CO1, r2 // first line of C
add CO2, CO1, r3 // second line of C
pld [ CO1, #C_PRE ] // prefetch the lines of C
pld [ CO2, #C_PRE ] // prefetch the lines of C
cgemm_kernel_L2_M2_20:
asrs L , K, #3 // L = K / 8
cmp L , #2
blt cgemm_kernel_L2_M2_32
KERNEL2x2_I
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
subs L, L, #2
ble cgemm_kernel_L2_M2_22a
.align 5
cgemm_kernel_L2_M2_22:
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
subs L, L, #1
bgt cgemm_kernel_L2_M2_22
cgemm_kernel_L2_M2_22a:
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_E
b cgemm_kernel_L2_M2_44
cgemm_kernel_L2_M2_32:
tst L, #1
ble cgemm_kernel_L2_M2_40
KERNEL2x2_I
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_E
b cgemm_kernel_L2_M2_44
cgemm_kernel_L2_M2_40:
INIT2x2
cgemm_kernel_L2_M2_44:
ands L , K, #7 // L = K % 8
ble cgemm_kernel_L2_M2_100
cgemm_kernel_L2_M2_46:
KERNEL2x2_SUB
subs L, L, #1
bne cgemm_kernel_L2_M2_46
cgemm_kernel_L2_M2_100:
SAVE2x2
cgemm_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s31} // restore floating point registers
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr

View File

@@ -0,0 +1,503 @@
#define REALNAME bli_dgemm_kernel_4x4
#define STACKSIZE 256
#define K r0
#define PTR_ALPHA r1
#define OLD_A r2
#define OLD_B r3
#define PTR_BETA [fp, #0 ]
#define OLD_C [fp, #4 ]
#define OLD_RSC [fp, #8 ]
#define OLD_CSC [fp, #12 ]
#define AUX [fp, #16 ]
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* register
*******************************************************/
#define L r2
#define AO r5
#define BO r6
#define CO1 r7
#define CO2 r8
#define CO3 r9
#define CO4 r12
#define A_PRE 96
#define B_PRE 96
#define C_PRE 0
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro INIT4x4
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
vmov.f64 d20, d16
vmov.f64 d21, d16
vmov.f64 d22, d16
vmov.f64 d23, d16
vmov.f64 d24, d16
vmov.f64 d25, d16
vmov.f64 d26, d16
vmov.f64 d27, d16
vmov.f64 d28, d16
vmov.f64 d29, d16
vmov.f64 d30, d16
vmov.f64 d31, d16
.endm
.macro KERNEL4x4_I
pld [ BO , #B_PRE ]
fldd d8 , [ BO ]
fldd d0 , [ AO ]
pld [ AO , #A_PRE ]
fldd d1 , [ AO, #8 ]
fmuld d16 , d0, d8
fldd d2 , [ AO, #16 ]
fmuld d17 , d1, d8
fldd d3 , [ AO, #24 ]
fmuld d18 , d2, d8
fldd d9 , [ BO, #8 ]
fmuld d19 , d3, d8
fldd d10, [ BO, #16 ]
fmuld d20 , d0, d9
fldd d11, [ BO, #24 ]
fmuld d21 , d1, d9
add BO , BO, #32
add AO , AO, #32
fmuld d22 , d2, d9
pld [ BO , #B_PRE ]
fldd d12, [ BO ]
fmuld d23 , d3, d9
pld [ AO , #A_PRE ]
fldd d4 , [ AO, #0 ]
fmuld d24 , d0, d10
fldd d5 , [ AO, #8 ]
fmuld d25 , d1, d10
fldd d6 , [ AO, #16 ]
fmuld d26 , d2, d10
fldd d7 , [ AO, #24 ]
fmuld d27 , d3, d10
fldd d13, [ BO, #8 ]
fmuld d28 , d0, d11
fldd d14, [ BO, #16 ]
fmuld d29 , d1, d11
fldd d15, [ BO, #24 ]
fmuld d30 , d2, d11
fmuld d31 , d3, d11
.endm
.macro KERNEL4x4_M2
fmacd d16 , d4, d12
pld [ AO , #A_PRE+32 ]
fmacd d17 , d5, d12
fldd d0 , [ AO , #32 ]
fmacd d18 , d6, d12
pld [ BO , #B_PRE+32 ]
fmacd d19 , d7, d12
fldd d8 , [ BO , #32 ]
fmacd d20 , d4, d13
fldd d1 , [ AO, #40 ]
fmacd d21 , d5, d13
fldd d2 , [ AO, #48 ]
fmacd d22 , d6, d13
fldd d3 , [ AO, #56 ]
fmacd d23 , d7, d13
fmacd d24 , d4, d14
fmacd d25 , d5, d14
fldd d9 , [ BO, #40 ]
fmacd d26 , d6, d14
fldd d10, [ BO, #48 ]
fmacd d27 , d7, d14
fldd d11, [ BO, #56 ]
fmacd d28 , d4, d15
fmacd d29 , d5, d15
add AO , AO, #64
fmacd d30 , d6, d15
add BO , BO, #64
fmacd d31 , d7, d15
.endm
.macro KERNEL4x4_M1
fmacd d16 , d0, d8
pld [ AO , #A_PRE ]
fmacd d17 , d1, d8
fldd d4 , [ AO ]
fmacd d18 , d2, d8
pld [ BO , #B_PRE ]
fmacd d19 , d3, d8
fldd d12, [ BO ]
fmacd d20 , d0, d9
fldd d5 , [ AO, #8 ]
fmacd d21 , d1, d9
fldd d6 , [ AO, #16 ]
fmacd d22 , d2, d9
fldd d7 , [ AO, #24 ]
fmacd d23 , d3, d9
fmacd d24 , d0, d10
fmacd d25 , d1, d10
fldd d13, [ BO, #8 ]
fmacd d26 , d2, d10
fldd d14, [ BO, #16 ]
fmacd d27 , d3, d10
fldd d15, [ BO, #24 ]
fmacd d28 , d0, d11
fmacd d29 , d1, d11
fmacd d30 , d2, d11
fmacd d31 , d3, d11
.endm
.macro KERNEL4x4_E
fmacd d16 , d4, d12
fmacd d17 , d5, d12
add BO , BO, #32
fmacd d18 , d6, d12
add AO , AO, #32
fmacd d19 , d7, d12
fmacd d20 , d4, d13
fmacd d21 , d5, d13
fmacd d22 , d6, d13
fmacd d23 , d7, d13
fmacd d24 , d4, d14
fmacd d25 , d5, d14
fmacd d26 , d6, d14
fmacd d27 , d7, d14
fmacd d28 , d4, d15
fmacd d29 , d5, d15
fmacd d30 , d6, d15
fmacd d31 , d7, d15
.endm
.macro KERNEL4x4_SUB
fldd d8 , [ BO ]
pld [ BO , #B_PRE ]
fldd d0 , [ AO ]
pld [ AO , #A_PRE ]
fldd d1 , [ AO, #8 ]
fmacd d16 , d0, d8
fldd d2 , [ AO, #16 ]
fmacd d17 , d1, d8
fldd d3 , [ AO, #24 ]
fmacd d18 , d2, d8
fldd d9 , [ BO, #8 ]
fmacd d19 , d3, d8
fldd d10, [ BO, #16 ]
fmacd d20 , d0, d9
fldd d11, [ BO, #24 ]
fmacd d21 , d1, d9
fmacd d22 , d2, d9
fmacd d23 , d3, d9
fmacd d24 , d0, d10
fmacd d25 , d1, d10
fmacd d26 , d2, d10
fmacd d27 , d3, d10
fmacd d28 , d0, d11
fmacd d29 , d1, d11
add AO , AO, #32
fmacd d30 , d2, d11
add BO , BO, #32
fmacd d31 , d3, d11
.endm
.macro SAVE4x4
ldr r3, OLD_RSC // Row stride size
lsl r3, r3, #3 // multiply with size of double
fldd d0, [ PTR_ALPHA ] // load alpha
ldr r4, PTR_BETA
fldd d1, [ r4 ] // load beta
//-----------------------------------------------------------
mov r2, CO1 // save pointer
mov r4, CO2 // save pointer
fldd d8, [ CO1 ] // load value from C
fldd d12, [ CO2 ] // load value from C
fmuld d8, d8, d1 // multiply with beta
add CO1, CO1, r3 // compute next pointer
fmacd d8, d0, d16 // multiply sum with alpha and add to value of C
add CO2, CO2, r3 // compute next pointer
fldd d9, [ CO1 ] // load value from C
fldd d13, [ CO2 ] // load value from C
fmuld d9, d9, d1 // multiply with beta
add CO1, CO1, r3 // compute next pointer
fmacd d9, d0, d17 // multiply sum with alpha and add to value of C
add CO2, CO2, r3 // compute next pointer
fldd d10, [ CO1 ] // load value from C
fldd d14, [ CO2 ] // load value from C
fmuld d10, d10, d1 // multiply with beta
add CO1, CO1, r3 // compute next pointer
fmacd d10, d0, d18 // multiply sum with alpha and add to value of C
add CO2, CO2, r3 // compute next pointer
fldd d11, [ CO1 ] // load value from C
fldd d15, [ CO2 ] // load value from C
fmuld d11, d11, d1 // multiply with beta
mov CO1, r2 // restore pointer
fmacd d11, d0, d19 // multiply sum with alpha and add to value of C
mov CO2, r4 // restore pointer
fstd d8, [ CO1 ] // store value in C
add CO1 , CO1, r3 // compute next pointer
fstd d9, [ CO1 ] // store value in C
add CO1 , CO1, r3 // compute next pointer
fstd d10, [ CO1 ] // store value in C
add CO1 , CO1, r3 // compute next pointer
fstd d11, [ CO1 ] // store value in C
//-----------------------------------------------------------
mov r2, CO3 // save pointer
fldd d8, [ CO3 ] // load value from C
fmuld d12, d12, d1 // multiply with beta
add CO3, CO3, r3 // compute next pointer
fmacd d12, d0, d20 // multiply sum with alpha and add to value of C
fldd d9, [ CO3 ] // load value from C
fmuld d13, d13, d1 // multiply with beta
add CO3, CO3, r3 // compute next pointer
fmacd d13, d0, d21 // multiply sum with alpha and add to value of C
fldd d10, [ CO3 ] // load value from C
fmuld d14, d14, d1 // multiply with beta
add CO3, CO3, r3 // compute next pointer
fmacd d14, d0, d22 // multiply sum with alpha and add to value of C
fldd d11, [ CO3 ] // load value from C
fmuld d15, d15, d1 // multiply with beta
mov CO3, r2 // restore pointer
fmacd d15, d0, d23 // multiply sum with alpha and add to value of C
fstd d12, [ CO2 ] // store value in C
add CO2 , CO2, r3 // compute next pointer
fstd d13, [ CO2 ] // store value in C
add CO2 , CO2, r3 // compute next pointer
fstd d14, [ CO2 ] // store value in C
add CO2 , CO2, r3 // compute next pointer
fstd d15, [ CO2 ] // store value in C
//-----------------------------------------------------------
mov r4, CO4 // save pointer
fldd d12, [ CO4 ] // load value from C
fmuld d8, d8, d1 // multiply with beta
add CO4, CO4, r3 // compute next pointer
fmacd d8, d0, d24 // multiply sum with alpha and add to value of C
fldd d13, [ CO4 ] // load value from C
fmuld d9, d9, d1 // multiply with beta
add CO4, CO4, r3 // compute next pointer
fmacd d9, d0, d25 // multiply sum with alpha and add to value of C
fldd d14, [ CO4 ] // load value from C
fmuld d10, d10, d1 // multiply with beta
add CO4, CO4, r3 // compute next pointer
fmacd d10, d0, d26 // multiply sum with alpha and add to value of C
fldd d15, [ CO4 ] // load value from C
fmuld d11, d11, d1 // multiply with beta
mov CO4, r4 // restore pointer
fmacd d11, d0, d27 // multiply sum with alpha and add to value of C
//-----------------------------------------------------------
fstd d8, [ CO3 ] // store value in C
fmuld d12, d12, d1 // multiply with beta
add CO3 , CO3, r3 // compute next pointer
fmacd d12, d0, d28 // multiply sum with alpha and add to value of C
fstd d9, [ CO3 ] // store value in C
fmuld d13, d13, d1 // multiply with beta
add CO3 , CO3, r3 // compute next pointer
fmacd d13, d0, d29 // multiply sum with alpha and add to value of C
fstd d10, [ CO3 ] // store value in C
fmuld d14, d14, d1 // multiply with beta
add CO3 , CO3, r3 // compute next pointer
fmacd d14, d0, d30 // multiply sum with alpha and add to value of C
fstd d11, [ CO3 ] // store value in C
fmuld d15, d15, d1 // multiply with beta
fstd d12, [ CO4 ] // store value in C
fmacd d15, d0, d31 // multiply sum with alpha and add to value of C
add CO4 , CO4, r3 // compute next pointer
fstd d13, [ CO4 ] // store value in C
add CO4 , CO4, r3 // compute next pointer
fstd d14, [ CO4 ] // store value in C
add CO4 , CO4, r3 // compute next pointer
fstd d15, [ CO4 ] // store value in C
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
.arm
.global REALNAME
.func REALNAME
REALNAME:
push {r4 - r9, fp} // save register
add fp, sp, #28 // add number of saved register multiplied by size of int
sub sp, sp, #STACKSIZE // reserve stack
mov AO, OLD_A // pointer matrix A
mov BO, OLD_B // pointer matrix B
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
ldr r2, OLD_C // pointer matrix C
ldr r3, OLD_CSC // Col stride size of C
lsl r3, r3, #3 // multiply with size of double
mov CO1, r2 // first line of C
add CO2, CO1, r3 // second line of C
add CO3, CO2, r3 // third line of C
add CO4, CO3, r3 // fourth line of C
pld [ CO1, #C_PRE ] // prefetch the lines of C
pld [ CO2, #C_PRE ] // prefetch the lines of C
pld [ CO3, #C_PRE ] // prefetch the lines of C
pld [ CO3, #C_PRE ] // prefetch the lines of C
dgemm_kernel_L4_M4_20:
asrs L , K, #3 // L = K / 8
cmp L , #2
blt dgemm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs L, L, #2
ble dgemm_kernel_L4_M4_22a
.align 5
dgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs L, L, #1
bgt dgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b dgemm_kernel_L4_M4_44
dgemm_kernel_L4_M4_32:
tst L, #1
ble dgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b dgemm_kernel_L4_M4_44
dgemm_kernel_L4_M4_40:
INIT4x4
dgemm_kernel_L4_M4_44:
ands L , K, #7 // L = K % 8
ble dgemm_kernel_L4_M4_100
dgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs L, L, #1
bne dgemm_kernel_L4_M4_46
dgemm_kernel_L4_M4_100:
SAVE4x4
dgemm_kernel_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr

View File

@@ -0,0 +1,134 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2012, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
extern void bli_sgemm_kernel_4x4(dim_t k,
float* alpha,
float* restrict a,
float* restrict b,
float* beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
void bli_sgemm_opt_4x4(
dim_t k,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
bli_sgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data);
}
extern void bli_dgemm_kernel_4x4(dim_t k,
double* alpha,
double* restrict a,
double* restrict b,
double* beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
void bli_dgemm_opt_4x4(
dim_t k,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
bli_dgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data);
}
extern void bli_cgemm_kernel_2x2(dim_t k,
scomplex* alpha,
scomplex* restrict a,
scomplex* restrict b,
scomplex* beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
void bli_cgemm_opt_4x4(
dim_t k,
scomplex* restrict alpha,
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
bli_cgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data);
}
extern void bli_zgemm_kernel_2x2(dim_t k,
dcomplex* alpha,
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
);
void bli_zgemm_opt_4x4(
dim_t k,
dcomplex* restrict alpha,
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
bli_zgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data);
}

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// #include "arm_neon.h"
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* data \
);
INSERT_GENTPROT_BASIC( gemm_opt_4x4 )

View File

@@ -0,0 +1,483 @@
#define REALNAME bli_sgemm_kernel_4x4
#define STACKSIZE 256
#define K r0
#define PTR_ALPHA r1
#define OLD_A r2
#define OLD_B r3
#define PTR_BETA [fp, #0 ]
#define OLD_C [fp, #4 ]
#define OLD_RSC [fp, #8 ]
#define OLD_CSC [fp, #12 ]
#define AUX [fp, #16 ]
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* register
*******************************************************/
#define L r2
#define AO r5
#define BO r6
#define CO1 r7
#define CO2 r8
#define CO3 r9
#define CO4 r12
#define A_PRE 96
#define B_PRE 96
#define C_PRE 0
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro INIT4x4
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
vmov.f32 s20, s16
vmov.f32 s21, s16
vmov.f32 s22, s16
vmov.f32 s23, s16
vmov.f32 s24, s16
vmov.f32 s25, s16
vmov.f32 s26, s16
vmov.f32 s27, s16
vmov.f32 s28, s16
vmov.f32 s29, s16
vmov.f32 s30, s16
vmov.f32 s31, s16
.endm
.macro KERNEL4x4_I
pld [ AO , #A_PRE ]
fldmias AO!, { s0 - s1 }
pld [ BO , #B_PRE ]
fldmias BO!, { s8 - s9 }
fmuls s16 , s0, s8
fldmias AO!, { s2 - s3 }
fmuls s17 , s1, s8
fmuls s18 , s2, s8
fldmias BO!, { s10 - s11 }
fmuls s19 , s3, s8
fmuls s20 , s0, s9
fldmias AO!, { s4 - s5 }
fmuls s21 , s1, s9
fmuls s22 , s2, s9
fldmias AO!, { s6 - s7 }
fmuls s23 , s3, s9
fmuls s24 , s0, s10
fldmias BO!, { s12 - s13 }
fmuls s25 , s1, s10
fmuls s26 , s2, s10
fldmias BO!, { s14 - s15 }
fmuls s27 , s3, s10
fmuls s28 , s0, s11
fmuls s29 , s1, s11
fmuls s30 , s2, s11
fmuls s31 , s3, s11
.endm
.macro KERNEL4x4_M2
pld [ AO , #A_PRE ]
fmacs s16 , s4, s12
fmacs s17 , s5, s12
fldmias AO!, { s0 - s3 }
fmacs s18 , s6, s12
pld [ BO , #B_PRE ]
fmacs s19 , s7, s12
fmacs s20 , s4, s13
fldmias BO!, { s8 - s11 }
fmacs s21 , s5, s13
fmacs s22 , s6, s13
//fldmias AO!, { s2 - s3 }
fmacs s23 , s7, s13
fmacs s24 , s4, s14
//fldmias BO!, { s10 - s11 }
fmacs s25 , s5, s14
fmacs s26 , s6, s14
fmacs s27 , s7, s14
fmacs s28 , s4, s15
fmacs s29 , s5, s15
fmacs s30 , s6, s15
fmacs s31 , s7, s15
.endm
.macro KERNEL4x4_M1
fmacs s16 , s0, s8
fldmias AO!, { s4 - s7 }
fmacs s17 , s1, s8
fmacs s18 , s2, s8
fldmias BO!, { s12 - s15 }
//fldmias AO!, { s6 - s7 }
fmacs s19 , s3, s8
fmacs s20 , s0, s9
fmacs s21 , s1, s9
fmacs s22 , s2, s9
//fldmias BO!, { s14 - s15 }
fmacs s23 , s3, s9
fmacs s24 , s0, s10
fmacs s25 , s1, s10
fmacs s26 , s2, s10
fmacs s27 , s3, s10
fmacs s28 , s0, s11
fmacs s29 , s1, s11
fmacs s30 , s2, s11
fmacs s31 , s3, s11
.endm
.macro KERNEL4x4_E
fmacs s16 , s4, s12
fmacs s17 , s5, s12
fmacs s18 , s6, s12
fmacs s19 , s7, s12
fmacs s20 , s4, s13
fmacs s21 , s5, s13
fmacs s22 , s6, s13
fmacs s23 , s7, s13
fmacs s24 , s4, s14
fmacs s25 , s5, s14
fmacs s26 , s6, s14
fmacs s27 , s7, s14
fmacs s28 , s4, s15
fmacs s29 , s5, s15
fmacs s30 , s6, s15
fmacs s31 , s7, s15
.endm
.macro KERNEL4x4_SUB
flds s8 , [ BO ]
flds s0 , [ AO ]
flds s1 , [ AO, #4 ]
fmacs s16 , s0, s8
flds s2 , [ AO, #8 ]
fmacs s17 , s1, s8
flds s3 , [ AO, #12 ]
fmacs s18 , s2, s8
flds s9 , [ BO, #4 ]
fmacs s19 , s3, s8
flds s10, [ BO, #8 ]
fmacs s20 , s0, s9
flds s11, [ BO, #12 ]
fmacs s21 , s1, s9
fmacs s22 , s2, s9
fmacs s23 , s3, s9
fmacs s24 , s0, s10
fmacs s25 , s1, s10
fmacs s26 , s2, s10
fmacs s27 , s3, s10
fmacs s28 , s0, s11
fmacs s29 , s1, s11
add AO , AO, #16
fmacs s30 , s2, s11
add BO , BO, #16
fmacs s31 , s3, s11
.endm
.macro SAVE4x4
ldr r3, OLD_RSC // Row stride size
lsl r3, r3, #2 // multiply with size of float
flds s0, [ PTR_ALPHA ] // load alpha
ldr r4, PTR_BETA
flds s1, [ r4 ] // load beta
//-----------------------------------------------------------
mov r2, CO1 // save pointer
mov r4, CO2 // save pointer
flds s8, [ CO1 ] // load value from C
flds s12, [ CO2 ] // load value from C
fmuls s8, s8, s1 // multiply with beta
add CO1, CO1, r3 // compute next pointer
fmacs s8, s0, s16 // multiply sum with alpha and add to value of C
add CO2, CO2, r3 // compute next pointer
flds s9, [ CO1 ] // load value from C
flds s13, [ CO2 ] // load value from C
fmuls s9, s9, s1 // multiply with beta
add CO1, CO1, r3 // compute next pointer
fmacs s9, s0, s17 // multiply sum with alpha and add to value of C
add CO2, CO2, r3 // compute next pointer
flds s10, [ CO1 ] // load value from C
flds s14, [ CO2 ] // load value from C
fmuls s10, s10, s1 // multiply with beta
add CO1, CO1, r3 // compute next pointer
fmacs s10, s0, s18 // multiply sum with alpha and add to value of C
add CO2, CO2, r3 // compute next pointer
flds s11, [ CO1 ] // load value from C
flds s15, [ CO2 ] // load value from C
fmuls s11, s11, s1 // multiply with beta
mov CO1, r2 // restore pointer
fmacs s11, s0, s19 // multiply sum with alpha and add to value of C
mov CO2, r4 // restore pointer
fsts s8, [ CO1 ] // store value in C
add CO1 , CO1, r3 // compute next pointer
fsts s9, [ CO1 ] // store value in C
add CO1 , CO1, r3 // compute next pointer
fsts s10, [ CO1 ] // store value in C
add CO1 , CO1, r3 // compute next pointer
fsts s11, [ CO1 ] // store value in C
//-----------------------------------------------------------
mov r2, CO3 // save pointer
flds s8, [ CO3 ] // load value from C
fmuls s12, s12, s1 // multiply with beta
add CO3, CO3, r3 // compute next pointer
fmacs s12, s0, s20 // multiply sum with alpha and add to value of C
flds s9, [ CO3 ] // load value from C
fmuls s13, s13, s1 // multiply with beta
add CO3, CO3, r3 // compute next pointer
fmacs s13, s0, s21 // multiply sum with alpha and add to value of C
flds s10, [ CO3 ] // load value from C
fmuls s14, s14, s1 // multiply with beta
add CO3, CO3, r3 // compute next pointer
fmacs s14, s0, s22 // multiply sum with alpha and add to value of C
flds s11, [ CO3 ] // load value from C
fmuls s15, s15, s1 // multiply with beta
mov CO3, r2 // restore pointer
fmacs s15, s0, s23 // multiply sum with alpha and add to value of C
fsts s12, [ CO2 ] // store value in C
add CO2 , CO2, r3 // compute next pointer
fsts s13, [ CO2 ] // store value in C
add CO2 , CO2, r3 // compute next pointer
fsts s14, [ CO2 ] // store value in C
add CO2 , CO2, r3 // compute next pointer
fsts s15, [ CO2 ] // store value in C
//-----------------------------------------------------------
mov r4, CO4 // save pointer
flds s12, [ CO4 ] // load value from C
fmuls s8, s8, s1 // multiply with beta
add CO4, CO4, r3 // compute next pointer
fmacs s8, s0, s24 // multiply sum with alpha and add to value of C
flds s13, [ CO4 ] // load value from C
fmuls s9, s9, s1 // multiply with beta
add CO4, CO4, r3 // compute next pointer
fmacs s9, s0, s25 // multiply sum with alpha and add to value of C
flds s14, [ CO4 ] // load value from C
fmuls s10, s10, s1 // multiply with beta
add CO4, CO4, r3 // compute next pointer
fmacs s10, s0, s26 // multiply sum with alpha and add to value of C
flds s15, [ CO4 ] // load value from C
fmuls s11, s11, s1 // multiply with beta
mov CO4, r4 // restore pointer
fmacs s11, s0, s27 // multiply sum with alpha and add to value of C
//-----------------------------------------------------------
fsts s8, [ CO3 ] // store value in C
fmuls s12, s12, s1 // multiply with beta
add CO3 , CO3, r3 // compute next pointer
fmacs s12, s0, s28 // multiply sum with alpha and add to value of C
fsts s9, [ CO3 ] // store value in C
fmuls s13, s13, s1 // multiply with beta
add CO3 , CO3, r3 // compute next pointer
fmacs s13, s0, s29 // multiply sum with alpha and add to value of C
fsts s10, [ CO3 ] // store value in C
fmuls s14, s14, s1 // multiply with beta
add CO3 , CO3, r3 // compute next pointer
fmacs s14, s0, s30 // multiply sum with alpha and add to value of C
fsts s11, [ CO3 ] // store value in C
fmuls s15, s15, s1 // multiply with beta
fsts s12, [ CO4 ] // store value in C
fmacs s15, s0, s31 // multiply sum with alpha and add to value of C
add CO4 , CO4, r3 // compute next pointer
fsts s13, [ CO4 ] // store value in C
add CO4 , CO4, r3 // compute next pointer
fsts s14, [ CO4 ] // store value in C
add CO4 , CO4, r3 // compute next pointer
fsts s15, [ CO4 ] // store value in C
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
.arm
.global REALNAME
.func REALNAME
REALNAME:
push {r4 - r9, fp} // save register
add fp, sp, #28 // add number of saved register multiplied by size of int
sub sp, sp, #STACKSIZE // reserve stack
mov AO, OLD_A // pointer matrix A
mov BO, OLD_B // pointer matrix B
sub r3, fp, #128
vstm r3, { s8 - s31 } // store floating point registers
ldr r2, OLD_C // pointer matrix C
ldr r3, OLD_CSC // Col stride size of C
lsl r3, r3, #2 // multiply with size of float
mov CO1, r2 // first line of C
add CO2, CO1, r3 // second line of C
add CO3, CO2, r3 // third line of C
add CO4, CO3, r3 // fourth line of C
pld [ CO1, #C_PRE ] // prefetch the lines of C
pld [ CO2, #C_PRE ] // prefetch the lines of C
pld [ CO3, #C_PRE ] // prefetch the lines of C
pld [ CO3, #C_PRE ] // prefetch the lines of C
sgemm_kernel_L4_M4_20:
asrs L , K, #3 // L = K / 8
cmp L , #2
blt sgemm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs L, L, #2
ble sgemm_kernel_L4_M4_22a
.align 5
sgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs L, L, #1
bgt sgemm_kernel_L4_M4_22
sgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b sgemm_kernel_L4_M4_44
sgemm_kernel_L4_M4_32:
tst L, #1
ble sgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b sgemm_kernel_L4_M4_44
sgemm_kernel_L4_M4_40:
INIT4x4
sgemm_kernel_L4_M4_44:
ands L , K, #7 // L = K % 8
ble sgemm_kernel_L4_M4_100
sgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs L, L, #1
bne sgemm_kernel_L4_M4_46
sgemm_kernel_L4_M4_100:
SAVE4x4
sgemm_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s31 } // restore floating point registers
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr

View File

@@ -0,0 +1,506 @@
#define REALNAME bli_zgemm_kernel_2x2
#define STACKSIZE 256
#define K r0
#define PTR_ALPHA r1
#define OLD_A r2
#define OLD_B r3
#define PTR_BETA [fp, #0 ]
#define OLD_C [fp, #4 ]
#define OLD_RSC [fp, #8 ]
#define OLD_CSC [fp, #12 ]
#define AUX [fp, #16 ]
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* register
*******************************************************/
#define L r2
#define AO r5
#define BO r6
#define CO1 r7
#define CO2 r8
#define A_PRE 96
#define B_PRE 96
#define C_PRE 0
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#define FMAC_BR fnmacd
#define FMAC_BI fmacd
#define NN 1
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fnmacd
#elif defined(CN) || defined(CT)
#define FADD_R faddd
#define FADD_I fsubd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define FADD_R faddd
#define FADD_I fsubd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#else
#define FADD_R fsubd
#define FADD_I faddd
#define FMAC_R1 fnmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fnmacd
#endif
.macro INIT2x2
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
vmov.f64 d20, d16
vmov.f64 d21, d16
vmov.f64 d22, d16
vmov.f64 d23, d16
vmov.f64 d24, d16
vmov.f64 d25, d16
vmov.f64 d26, d16
vmov.f64 d27, d16
vmov.f64 d28, d16
vmov.f64 d29, d16
vmov.f64 d30, d16
vmov.f64 d31, d16
.endm
.macro KERNEL2x2_I
pld [ AO , #A_PRE ]
pld [ BO , #B_PRE ]
fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]
fldd d8 , [ BO ]
fldd d9 , [ BO, #8 ]
fmuld d16 , d0, d8
fldd d2 , [ AO, #16 ]
fmuld d24 , d1, d9
fldd d3 , [ AO, #24 ]
fmuld d17 , d0, d9
fldd d10, [ BO, #16 ]
fmuld d25 , d1, d8
fldd d11, [ BO, #24 ]
fmuld d18 , d2, d8
add BO , BO, #32
fmuld d26 , d3, d9
add AO , AO, #32
fmuld d19 , d2, d9
pld [ BO , #B_PRE ]
fmuld d27 , d3, d8
pld [ AO , #A_PRE ]
fmuld d20 , d0, d10
fldd d4 , [ AO, #0 ]
fmuld d28 , d1, d11
fldd d5 , [ AO, #8 ]
fmuld d21 , d0, d11
fldd d12, [ BO ]
fmuld d29 , d1, d10
fldd d13, [ BO, #8 ]
fmuld d22 , d2, d10
fldd d6 , [ AO, #16 ]
fmuld d30 , d3, d11
fldd d7 , [ AO, #24 ]
fmuld d23 , d2, d11
fldd d14, [ BO, #16 ]
fmuld d31 , d3, d10
fldd d15, [ BO, #24 ]
add BO , BO, #32
add AO , AO, #32
.endm
.macro KERNEL2x2_M1
pld [ AO , #A_PRE ]
fmacd d16 , d0, d8
pld [ BO , #B_PRE ]
fmacd d24 , d1, d9
fldd d4 , [ AO, #0 ]
fmacd d17 , d0, d9
fldd d5 , [ AO, #8 ]
fmacd d25 , d1, d8
fldd d12, [ BO ]
fmacd d18 , d2, d8
fldd d13, [ BO, #8 ]
fmacd d26 , d3, d9
fldd d6 , [ AO, #16 ]
fmacd d19 , d2, d9
fldd d7 , [ AO, #24 ]
fmacd d27 , d3, d8
fmacd d20 , d0, d10
fldd d14, [ BO, #16 ]
fmacd d28 , d1, d11
fmacd d21 , d0, d11
fldd d15, [ BO, #24 ]
fmacd d29 , d1, d10
fmacd d22 , d2, d10
add BO , BO, #32
fmacd d30 , d3, d11
fmacd d23 , d2, d11
add AO , AO, #32
fmacd d31 , d3, d10
.endm
.macro KERNEL2x2_M2
pld [ AO , #A_PRE ]
fmacd d16 , d4, d12
pld [ BO , #B_PRE ]
fmacd d24 , d5, d13
fldd d0 , [ AO, #0 ]
fmacd d17 , d4, d13
fldd d1 , [ AO, #8 ]
fmacd d25 , d5, d12
fmacd d18 , d6, d12
fldd d8 , [ BO ]
fmacd d26 , d7, d13
fldd d9 , [ BO, #8 ]
fmacd d19 , d6, d13
fmacd d27 , d7, d12
fldd d2 , [ AO, #16 ]
fmacd d20 , d4, d14
fldd d3 , [ AO, #24 ]
fmacd d28 , d5, d15
fmacd d21 , d4, d15
fldd d10, [ BO, #16 ]
fmacd d29 , d5, d14
fldd d11, [ BO, #24 ]
fmacd d22 , d6, d14
fmacd d30 , d7, d15
add BO , BO, #32
fmacd d23 , d6, d15
add AO , AO, #32
fmacd d31 , d7, d14
.endm
.macro KERNEL2x2_E
fmacd d16 , d4, d12
fmacd d24 , d5, d13
fmacd d17 , d4, d13
fmacd d25 , d5, d12
fmacd d18 , d6, d12
fmacd d26 , d7, d13
fmacd d19 , d6, d13
fmacd d27 , d7, d12
fmacd d20 , d4, d14
fmacd d28 , d5, d15
fmacd d21 , d4, d15
fmacd d29 , d5, d14
fmacd d22 , d6, d14
fmacd d30 , d7, d15
fmacd d23 , d6, d15
fmacd d31 , d7, d14
.endm
.macro KERNEL2x2_SUB
pld [ AO , #A_PRE ]
pld [ BO , #B_PRE ]
fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]
fldd d8 , [ BO ]
fldd d9 , [ BO, #8 ]
fmacd d16 , d0, d8
fldd d2 , [ AO, #16 ]
fmacd d24 , d1, d9
fldd d3 , [ AO, #24 ]
fmacd d17 , d0, d9
fldd d10, [ BO, #16 ]
fmacd d25 , d1, d8
fldd d11, [ BO, #24 ]
fmacd d18 , d2, d8
fmacd d26 , d3, d9
fmacd d19 , d2, d9
fmacd d27 , d3, d8
fmacd d20 , d0, d10
fmacd d28 , d1, d11
fmacd d21 , d0, d11
fmacd d29 , d1, d10
fmacd d22 , d2, d10
add BO , BO, #32
fmacd d30 , d3, d11
fmacd d23 , d2, d11
add AO , AO, #32
fmacd d31 , d3, d10
.endm
.macro SAVE2x2
ldr r3, OLD_RSC // Row stride size
lsl r3, r3, #4 // multiply with size of complex double
fldd d0, [ PTR_ALPHA ] // load real part of alpha
fldd d1, [ PTR_ALPHA, #8 ] // load imag part of alpha
ldr r4, PTR_BETA
fldd d2, [ r4 ] // load real part of beta
fldd d3, [ r4, #8 ] // load imag part of beta
// Add/Sub the real and the imag parts
FADD_R d16, d24 , d16
FADD_I d17, d25 , d17
FADD_R d18, d26 , d18
FADD_I d19, d27 , d19
FADD_R d20, d28 , d20
FADD_I d21, d29 , d21
FADD_R d22, d30 , d22
FADD_I d23, d31 , d23
mov r4, CO1 // save pointer
fldmiad CO1, { d4 - d5 } // read real and imag part from C
add CO1, CO1, r3
mov r2, CO2 // save pointer
fldmiad CO2, { d8 - d9 } // read real and imag part from C
add CO2, CO2, r3
fmuld d24, d4, d2 // multiply Beta-real with C-real
fmuld d25, d5, d2 // multiply Beta-real with C-imag
fmuld d28, d8, d2 // multiply Beta-real with C-real
fmuld d29, d9, d2 // multiply Beta-real with C-imag
FMAC_BR d24, d3, d5 // multiply beta-imag with C-imag and add
FMAC_BI d25, d3, d4 // multiply beta-imag with C-real and add
FMAC_BR d28, d3, d9 // multiply beta-imag with C-imag and add
FMAC_BI d29, d3, d8 // multiply beta-imag with C-real and add
FMAC_R1 d24 , d0 , d16
FMAC_I1 d25 , d0 , d17
FMAC_R2 d24 , d1 , d17
FMAC_I2 d25 , d1 , d16
FMAC_R1 d28 , d0 , d20
FMAC_I1 d29 , d0 , d21
FMAC_R2 d28 , d1 , d21
FMAC_I2 d29 , d1 , d20
fldmiad CO1, { d4 - d5 } // read real and imag part from C
fldmiad CO2, { d8 - d9 } // read real and imag part from C
fmuld d26, d4, d2 // multiply Beta-real with C-real
fmuld d27, d5, d2 // multiply Beta-real with C-imag
fmuld d30, d8, d2 // multiply Beta-real with C-real
fmuld d31, d9, d2 // multiply Beta-real with C-imag
FMAC_BR d26, d3, d5 // multiply beta-imag with C-imag and add
FMAC_BI d27, d3, d4 // multiply beta-imag with C-real and add
FMAC_BR d30, d3, d9 // multiply beta-imag with C-imag and add
FMAC_BI d31, d3, d8 // multiply beta-imag with C-real and add
FMAC_R1 d26 , d0 , d18
FMAC_I1 d27 , d0 , d19
FMAC_R2 d26 , d1 , d19
FMAC_I2 d27 , d1 , d18
FMAC_R1 d30, d0 , d22
FMAC_I1 d31, d0 , d23
FMAC_R2 d30, d1 , d23
FMAC_I2 d31, d1 , d22
mov CO1, r4 // restore pointer
mov CO2, r2 // restore pointer
fstmiad CO1, { d24 - d25 }
fstmiad CO2, { d28 - d29 }
add CO1, CO1, r3
add CO2, CO2, r3
fstmiad CO1, { d26 - d27 }
fstmiad CO2, { d30 - d31 }
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
.arm
.global REALNAME
.func REALNAME
REALNAME:
push {r4 - r9, fp} // save register
add fp, sp, #28 // add number of saved register multiplied by size of int
sub sp, sp, #STACKSIZE // reserve stack
mov AO, OLD_A // pointer matrix A
mov BO, OLD_B // pointer matrix B
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
ldr r2, OLD_C // pointer matrix C
ldr r3, OLD_CSC // Col stride size of C
lsl r3, r3, #4 // multiply with size of complex double
mov CO1, r2 // first line of C
add CO2, CO1, r3 // second line of C
pld [ CO1, #C_PRE ] // prefetch the lines of C
pld [ CO2, #C_PRE ] // prefetch the lines of C
zgemm_kernel_L2_M2_20:
asrs L , K, #3 // L = K / 8
cmp L , #2
blt zgemm_kernel_L2_M2_32
KERNEL2x2_I
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
subs L, L, #2
ble zgemm_kernel_L2_M2_22a
.align 5
zgemm_kernel_L2_M2_22:
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
subs L, L, #1
bgt zgemm_kernel_L2_M2_22
zgemm_kernel_L2_M2_22a:
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_E
b zgemm_kernel_L2_M2_44
zgemm_kernel_L2_M2_32:
tst L, #1
ble zgemm_kernel_L2_M2_40
KERNEL2x2_I
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_M2
KERNEL2x2_M1
KERNEL2x2_E
b zgemm_kernel_L2_M2_44
zgemm_kernel_L2_M2_40:
INIT2x2
zgemm_kernel_L2_M2_44:
ands L , K, #7 // L = K % 8
ble zgemm_kernel_L2_M2_100
zgemm_kernel_L2_M2_46:
KERNEL2x2_SUB
subs L, L, #1
bne zgemm_kernel_L2_M2_46
zgemm_kernel_L2_M2_100:
SAVE2x2
zgemm_kernel_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
sub sp, fp, #28
pop {r4 - r9, fp}
bx lr

108
config/armv7a/make_defs.mk Normal file
View File

@@ -0,0 +1,108 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Only include this block of code once.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := gcc
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := #-msse3 # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
LDFLAGS := -lm
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif