mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Merge pull request #3 from figual/master
New ARM armv7a kernels and Assembly file consideration in Makefile
This commit is contained in:
20
Makefile
20
Makefile
@@ -317,18 +317,24 @@ CFLAGS_KERNELS := $(CFLAGS_KERNELS) $(VERS_DEF)
|
||||
# Convert source file paths to object file paths by replacing the base source
|
||||
# directories with the base object directories, and also replacing the source
|
||||
# file suffix (eg: '.c') with '.o'.
|
||||
MK_BLIS_CONFIG_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
MK_BLIS_FRAME_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_FRAME_SRC)))
|
||||
MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
MK_BLIS_FRAME_NOOPT_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_FRAME_NOOPT_SRC)))
|
||||
MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_FRAME_KERNELS_SRC)))
|
||||
|
||||
MK_BLIS_FRAME_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
MK_BLIS_CONFIG_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.S, $(MK_CONFIG_SRC)))
|
||||
MK_BLIS_CONFIG_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_CONFIG_SRC)))
|
||||
MK_BLIS_FRAME_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.S, $(MK_CONFIG_NOOPT_SRC)))
|
||||
MK_BLIS_CONFIG_NOOPT_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_CONFIG_NOOPT_SRC)))
|
||||
MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.S, $(MK_CONFIG_KERNELS_SRC)))
|
||||
MK_BLIS_CONFIG_KERNELS_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_CONFIG_KERNELS_SRC)))
|
||||
|
||||
# Combine all of the object files into some readily-accessible variables.
|
||||
@@ -427,7 +433,7 @@ else
|
||||
@$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@
|
||||
endif
|
||||
|
||||
$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.c $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH)
|
||||
$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.[cS] $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@
|
||||
else
|
||||
|
||||
169
config/armv7a/bli_config.h
Normal file
169
config/armv7a/bli_config.h
Normal file
@@ -0,0 +1,169 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
// -- INTEGER PROPERTIES -------------------------------------------------------
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions,
|
||||
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
|
||||
// integers while 64 results in 64-bit integers. Any other value results in use
|
||||
// of the C99 type "long int". Note that this ONLY affects integers used
|
||||
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
|
||||
// interface.
|
||||
#define BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
|
||||
|
||||
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
|
||||
|
||||
// Define the number of floating-point types supported, and the size of the
|
||||
// largest type.
|
||||
#define BLIS_NUM_FP_TYPES 4
|
||||
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
|
||||
|
||||
// Enable use of built-in C99 "float complex" and "double complex" types and
|
||||
// associated overloaded operations and functions? Disabling results in
|
||||
// scomplex and dcomplex being defined in terms of simple structs.
|
||||
//#define BLIS_ENABLE_C99_COMPLEX
|
||||
|
||||
|
||||
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
// The maximum number of BLIS threads that will run concurrently.
|
||||
#define BLIS_MAX_NUM_THREADS 1
|
||||
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
// -- Contiguous (static) memory allocator --
|
||||
|
||||
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
|
||||
// contiguous memory pools.
|
||||
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_MC_X_NC_BLOCKS 0
|
||||
|
||||
// The maximum preload byte offset is used to pad the end of the contiguous
|
||||
// memory pools so that the micro-kernel, when computing with the end of the
|
||||
// last block, can exceed the bounds of the usable portion of the memory
|
||||
// region without causing a segmentation fault.
|
||||
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
|
||||
|
||||
// -- Memory alignment --
|
||||
|
||||
// It is sometimes useful to define the various memory alignments in terms
|
||||
// of some other characteristics of the system, such as the cache line size
|
||||
// and the page size.
|
||||
#define BLIS_CACHE_LINE_SIZE 32
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 32
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when allocating memory dynamically from the operating
|
||||
// system (eg: posix_memalign()). To disable heap alignment and just use
|
||||
// malloc() instead, set this to 1.
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when sizing leading dimensions of dynamically
|
||||
// allocated memory.
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
// Alignment size used when allocating entire blocks of contiguous memory
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
// Basic (homogeneous) datatype support always enabled.
|
||||
|
||||
// Enable mixed domain operations?
|
||||
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
|
||||
// Enable extra mixed precision operations?
|
||||
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
|
||||
|
||||
|
||||
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
|
||||
|
||||
// Stay initialized after auto-initialization, unless and until the user
|
||||
// explicitly calls bli_finalize().
|
||||
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
|
||||
|
||||
|
||||
|
||||
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
|
||||
|
||||
// Enable the BLAS compatibility layer?
|
||||
#define BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions and
|
||||
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
|
||||
// A value of 32 results in the compatibility layer using 32-bit signed integers
|
||||
// while 64 results in 64-bit integers. Any other value results in use of the
|
||||
// C99 type "long int". Note that this ONLY affects integers used within the
|
||||
// BLAS compatibility layer.
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
#define PASTEF77(ch1,name) ch1 ## name ## _
|
||||
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
348
config/armv7a/bli_kernel.h
Normal file
348
config/armv7a/bli_kernel.h
Normal file
@@ -0,0 +1,348 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 432
|
||||
#define BLIS_DEFAULT_KC_S 352
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 192
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 2
|
||||
#define BLIS_DEFAULT_NR_C 2
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 2
|
||||
#define BLIS_DEFAULT_NR_Z 2
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
// used by certain blocked variants. But when the *are* used, they MUST be
|
||||
// be an integer multiple of NR!
|
||||
|
||||
#define BLIS_DEFAULT_NI_FAC 16
|
||||
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
|
||||
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
|
||||
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
|
||||
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_4x4.h"
|
||||
#define GEMM_UKERNEL gemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
502
config/armv7a/kernels/3/bli_cgemm_kernel_2x2.S
Normal file
502
config/armv7a/kernels/3/bli_cgemm_kernel_2x2.S
Normal file
@@ -0,0 +1,502 @@
|
||||
|
||||
#define REALNAME bli_cgemm_kernel_2x2
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define K r0
|
||||
#define PTR_ALPHA r1
|
||||
#define OLD_A r2
|
||||
#define OLD_B r3
|
||||
#define PTR_BETA [fp, #0 ]
|
||||
#define OLD_C [fp, #4 ]
|
||||
#define OLD_RSC [fp, #8 ]
|
||||
#define OLD_CSC [fp, #12 ]
|
||||
#define AUX [fp, #16 ]
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
* for store and restore of floating point
|
||||
* register
|
||||
*******************************************************/
|
||||
|
||||
#define L r2
|
||||
|
||||
#define AO r5
|
||||
#define BO r6
|
||||
|
||||
#define CO1 r7
|
||||
#define CO2 r8
|
||||
|
||||
|
||||
#define A_PRE 96
|
||||
#define B_PRE 96
|
||||
#define C_PRE 0
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
#define FMAC_BR fnmacs
|
||||
#define FMAC_BI fmacs
|
||||
|
||||
#define NN 1
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fnmacs
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define FADD_R fadds
|
||||
#define FADD_I fsubs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define FADD_R fadds
|
||||
#define FADD_I fsubs
|
||||
|
||||
#define FMAC_R1 fmacs
|
||||
#define FMAC_R2 fnmacs
|
||||
#define FMAC_I1 fmacs
|
||||
#define FMAC_I2 fmacs
|
||||
|
||||
#else
|
||||
|
||||
#define FADD_R fsubs
|
||||
#define FADD_I fadds
|
||||
|
||||
#define FMAC_R1 fnmacs
|
||||
#define FMAC_R2 fmacs
|
||||
#define FMAC_I1 fnmacs
|
||||
#define FMAC_I2 fnmacs
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s18, s16
|
||||
vmov.f32 s19, s16
|
||||
vmov.f32 s20, s16
|
||||
vmov.f32 s21, s16
|
||||
vmov.f32 s22, s16
|
||||
vmov.f32 s23, s16
|
||||
vmov.f32 s24, s16
|
||||
vmov.f32 s25, s16
|
||||
vmov.f32 s26, s16
|
||||
vmov.f32 s27, s16
|
||||
vmov.f32 s28, s16
|
||||
vmov.f32 s29, s16
|
||||
vmov.f32 s30, s16
|
||||
vmov.f32 s31, s16
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL2x2_I
|
||||
pld [ AO , #A_PRE ]
|
||||
pld [ BO , #B_PRE ]
|
||||
flds s0 , [ AO ]
|
||||
flds s1 , [ AO, #4 ]
|
||||
flds s8 , [ BO ]
|
||||
flds s9 , [ BO, #4 ]
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
flds s2 , [ AO, #8 ]
|
||||
fmuls s24 , s1, s9
|
||||
flds s3 , [ AO, #12 ]
|
||||
fmuls s17 , s0, s9
|
||||
flds s10, [ BO, #8 ]
|
||||
fmuls s25 , s1, s8
|
||||
|
||||
flds s11, [ BO, #12 ]
|
||||
fmuls s18 , s2, s8
|
||||
add BO , BO, #16
|
||||
fmuls s26 , s3, s9
|
||||
add AO , AO, #16
|
||||
fmuls s19 , s2, s9
|
||||
pld [ BO , #B_PRE ]
|
||||
fmuls s27 , s3, s8
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fmuls s20 , s0, s10
|
||||
flds s4 , [ AO, #0 ]
|
||||
fmuls s28 , s1, s11
|
||||
flds s5 , [ AO, #4 ]
|
||||
fmuls s21 , s0, s11
|
||||
flds s12, [ BO ]
|
||||
fmuls s29 , s1, s10
|
||||
|
||||
flds s13, [ BO, #4 ]
|
||||
fmuls s22 , s2, s10
|
||||
flds s6 , [ AO, #8 ]
|
||||
fmuls s30 , s3, s11
|
||||
flds s7 , [ AO, #12 ]
|
||||
fmuls s23 , s2, s11
|
||||
flds s14, [ BO, #8 ]
|
||||
fmuls s31 , s3, s10
|
||||
flds s15, [ BO, #12 ]
|
||||
|
||||
add BO , BO, #16
|
||||
add AO , AO, #16
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
.macro KERNEL2x2_M1
|
||||
pld [ AO , #A_PRE ]
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s24 , s1, s9
|
||||
flds s4 , [ AO, #0 ]
|
||||
fmacs s17 , s0, s9
|
||||
flds s5 , [ AO, #4 ]
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
flds s12, [ BO ]
|
||||
fmacs s18 , s2, s8
|
||||
flds s13, [ BO, #4 ]
|
||||
fmacs s26 , s3, s9
|
||||
flds s6 , [ AO, #8 ]
|
||||
fmacs s19 , s2, s9
|
||||
flds s7 , [ AO, #12 ]
|
||||
fmacs s27 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s10
|
||||
flds s14, [ BO, #8 ]
|
||||
fmacs s28 , s1, s11
|
||||
fmacs s21 , s0, s11
|
||||
flds s15, [ BO, #12 ]
|
||||
fmacs s29 , s1, s10
|
||||
|
||||
fmacs s22 , s2, s10
|
||||
add BO , BO, #16
|
||||
fmacs s30 , s3, s11
|
||||
fmacs s23 , s2, s11
|
||||
add AO , AO, #16
|
||||
fmacs s31 , s3, s10
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL2x2_M2
|
||||
|
||||
fmacs s16 , s4, s12
|
||||
fmacs s24 , s5, s13
|
||||
flds s0 , [ AO, #0 ]
|
||||
fmacs s17 , s4, s13
|
||||
flds s1 , [ AO, #4 ]
|
||||
fmacs s25 , s5, s12
|
||||
|
||||
fmacs s18 , s6, s12
|
||||
flds s8 , [ BO ]
|
||||
fmacs s26 , s7, s13
|
||||
flds s9 , [ BO, #4 ]
|
||||
fmacs s19 , s6, s13
|
||||
fmacs s27 , s7, s12
|
||||
|
||||
flds s2 , [ AO, #8 ]
|
||||
fmacs s20 , s4, s14
|
||||
flds s3 , [ AO, #12 ]
|
||||
fmacs s28 , s5, s15
|
||||
fmacs s21 , s4, s15
|
||||
flds s10, [ BO, #8 ]
|
||||
fmacs s29 , s5, s14
|
||||
|
||||
flds s11, [ BO, #12 ]
|
||||
fmacs s22 , s6, s14
|
||||
fmacs s30 , s7, s15
|
||||
add BO , BO, #16
|
||||
fmacs s23 , s6, s15
|
||||
add AO , AO, #16
|
||||
fmacs s31 , s7, s14
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL2x2_E
|
||||
|
||||
fmacs s16 , s4, s12
|
||||
fmacs s24 , s5, s13
|
||||
fmacs s17 , s4, s13
|
||||
fmacs s25 , s5, s12
|
||||
|
||||
fmacs s18 , s6, s12
|
||||
fmacs s26 , s7, s13
|
||||
fmacs s19 , s6, s13
|
||||
fmacs s27 , s7, s12
|
||||
|
||||
fmacs s20 , s4, s14
|
||||
fmacs s28 , s5, s15
|
||||
fmacs s21 , s4, s15
|
||||
fmacs s29 , s5, s14
|
||||
|
||||
fmacs s22 , s6, s14
|
||||
fmacs s30 , s7, s15
|
||||
fmacs s23 , s6, s15
|
||||
fmacs s31 , s7, s14
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
flds s0 , [ AO ]
|
||||
flds s1 , [ AO, #4 ]
|
||||
flds s8 , [ BO ]
|
||||
flds s9 , [ BO, #4 ]
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
flds s2 , [ AO, #8 ]
|
||||
fmacs s24 , s1, s9
|
||||
flds s3 , [ AO, #12 ]
|
||||
fmacs s17 , s0, s9
|
||||
flds s10, [ BO, #8 ]
|
||||
fmacs s25 , s1, s8
|
||||
|
||||
flds s11, [ BO, #12 ]
|
||||
fmacs s18 , s2, s8
|
||||
fmacs s26 , s3, s9
|
||||
fmacs s19 , s2, s9
|
||||
fmacs s27 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s10
|
||||
fmacs s28 , s1, s11
|
||||
fmacs s21 , s0, s11
|
||||
fmacs s29 , s1, s10
|
||||
|
||||
fmacs s22 , s2, s10
|
||||
add BO , BO, #16
|
||||
fmacs s30 , s3, s11
|
||||
fmacs s23 , s2, s11
|
||||
add AO , AO, #16
|
||||
fmacs s31 , s3, s10
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
|
||||
.macro SAVE2x2
|
||||
|
||||
ldr r3, OLD_RSC // Row stride size
|
||||
lsl r3, r3, #3 // multiply with size of complex float
|
||||
|
||||
flds s0, [ PTR_ALPHA ] // load real part of alpha
|
||||
flds s1, [ PTR_ALPHA, #4 ] // load imag part of alpha
|
||||
ldr r4, PTR_BETA
|
||||
flds s2, [ r4 ] // load real part of beta
|
||||
flds s3, [ r4, #4 ] // load imag part of beta
|
||||
|
||||
// Add/Sub the real and the imag parts
|
||||
FADD_R s16, s24 , s16
|
||||
FADD_I s17, s25 , s17
|
||||
FADD_R s18, s26 , s18
|
||||
FADD_I s19, s27 , s19
|
||||
FADD_R s20, s28 , s20
|
||||
FADD_I s21, s29 , s21
|
||||
FADD_R s22, s30 , s22
|
||||
FADD_I s23, s31 , s23
|
||||
|
||||
mov r4, CO1 // save pointer
|
||||
fldmias CO1, { s4 - s5 } // read real and imag part from C
|
||||
add CO1, CO1, r3
|
||||
|
||||
mov r2, CO2 // save pointer
|
||||
fldmias CO2, { s8 - s9 } // read real and imag part from C
|
||||
add CO2, CO2, r3
|
||||
|
||||
fmuls s24, s4, s2 // multiply Beta-real with C-real
|
||||
fmuls s25, s5, s2 // multiply Beta-real with C-imag
|
||||
fmuls s28, s8, s2 // multiply Beta-real with C-real
|
||||
fmuls s29, s9, s2 // multiply Beta-real with C-imag
|
||||
|
||||
FMAC_BR s24, s3, s5 // multiply beta-imag with C-imag and add
|
||||
FMAC_BI s25, s3, s4 // multiply beta-imag with C-real and add
|
||||
FMAC_BR s28, s3, s9 // multiply beta-imag with C-imag and add
|
||||
FMAC_BI s29, s3, s8 // multiply beta-imag with C-real and add
|
||||
|
||||
FMAC_R1 s24 , s0 , s16
|
||||
FMAC_I1 s25 , s0 , s17
|
||||
FMAC_R2 s24 , s1 , s17
|
||||
FMAC_I2 s25 , s1 , s16
|
||||
|
||||
FMAC_R1 s28 , s0 , s20
|
||||
FMAC_I1 s29 , s0 , s21
|
||||
FMAC_R2 s28 , s1 , s21
|
||||
FMAC_I2 s29 , s1 , s20
|
||||
|
||||
fldmias CO1, { s4 - s5 } // read real and imag part from C
|
||||
fldmias CO2, { s8 - s9 } // read real and imag part from C
|
||||
|
||||
fmuls s26, s4, s2 // multiply Beta-real with C-real
|
||||
fmuls s27, s5, s2 // multiply Beta-real with C-imag
|
||||
fmuls s30, s8, s2 // multiply Beta-real with C-real
|
||||
fmuls s31, s9, s2 // multiply Beta-real with C-imag
|
||||
|
||||
FMAC_BR s26, s3, s5 // multiply beta-imag with C-imag and add
|
||||
FMAC_BI s27, s3, s4 // multiply beta-imag with C-real and add
|
||||
FMAC_BR s30, s3, s9 // multiply beta-imag with C-imag and add
|
||||
FMAC_BI s31, s3, s8 // multiply beta-imag with C-real and add
|
||||
|
||||
FMAC_R1 s26 , s0 , s18
|
||||
FMAC_I1 s27 , s0 , s19
|
||||
FMAC_R2 s26 , s1 , s19
|
||||
FMAC_I2 s27 , s1 , s18
|
||||
|
||||
FMAC_R1 s30, s0 , s22
|
||||
FMAC_I1 s31, s0 , s23
|
||||
FMAC_R2 s30, s1 , s23
|
||||
FMAC_I2 s31, s1 , s22
|
||||
|
||||
mov CO1, r4 // restore pointer
|
||||
mov CO2, r2 // restore pointer
|
||||
fstmias CO1, { s24 - s25 }
|
||||
fstmias CO2, { s28 - s29 }
|
||||
add CO1, CO1, r3
|
||||
add CO2, CO2, r3
|
||||
fstmias CO1, { s26 - s27 }
|
||||
fstmias CO2, { s30 - s31 }
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.arm
|
||||
.global REALNAME
|
||||
.func REALNAME
|
||||
|
||||
REALNAME:
|
||||
|
||||
push {r4 - r9, fp} // save register
|
||||
add fp, sp, #28 // add number of saved register multiplied by size of int
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
mov AO, OLD_A // pointer matrix A
|
||||
mov BO, OLD_B // pointer matrix B
|
||||
|
||||
sub r3, fp, #128
|
||||
vstm r3, { s8 - s31} // store floating point registers
|
||||
|
||||
ldr r2, OLD_C // pointer matrix C
|
||||
ldr r3, OLD_CSC // Col stride size of C
|
||||
lsl r3, r3, #3 // multiply with size of complex float
|
||||
|
||||
mov CO1, r2 // first line of C
|
||||
add CO2, CO1, r3 // second line of C
|
||||
|
||||
pld [ CO1, #C_PRE ] // prefetch the lines of C
|
||||
pld [ CO2, #C_PRE ] // prefetch the lines of C
|
||||
|
||||
cgemm_kernel_L2_M2_20:
|
||||
|
||||
asrs L , K, #3 // L = K / 8
|
||||
cmp L , #2
|
||||
blt cgemm_kernel_L2_M2_32
|
||||
|
||||
KERNEL2x2_I
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
subs L, L, #2
|
||||
ble cgemm_kernel_L2_M2_22a
|
||||
.align 5
|
||||
|
||||
cgemm_kernel_L2_M2_22:
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
subs L, L, #1
|
||||
bgt cgemm_kernel_L2_M2_22
|
||||
|
||||
cgemm_kernel_L2_M2_22a:
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_E
|
||||
|
||||
b cgemm_kernel_L2_M2_44
|
||||
|
||||
cgemm_kernel_L2_M2_32:
|
||||
|
||||
tst L, #1
|
||||
ble cgemm_kernel_L2_M2_40
|
||||
|
||||
KERNEL2x2_I
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_E
|
||||
|
||||
b cgemm_kernel_L2_M2_44
|
||||
|
||||
cgemm_kernel_L2_M2_40:
|
||||
|
||||
INIT2x2
|
||||
|
||||
cgemm_kernel_L2_M2_44:
|
||||
|
||||
ands L , K, #7 // L = K % 8
|
||||
ble cgemm_kernel_L2_M2_100
|
||||
|
||||
cgemm_kernel_L2_M2_46:
|
||||
|
||||
KERNEL2x2_SUB
|
||||
|
||||
subs L, L, #1
|
||||
bne cgemm_kernel_L2_M2_46
|
||||
|
||||
cgemm_kernel_L2_M2_100:
|
||||
|
||||
SAVE2x2
|
||||
|
||||
cgemm_kernel_L999:
|
||||
|
||||
sub r3, fp, #128
|
||||
vldm r3, { s8 - s31} // restore floating point registers
|
||||
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
503
config/armv7a/kernels/3/bli_dgemm_kernel_4x4.S
Normal file
503
config/armv7a/kernels/3/bli_dgemm_kernel_4x4.S
Normal file
@@ -0,0 +1,503 @@
|
||||
|
||||
#define REALNAME bli_dgemm_kernel_4x4
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define K r0
|
||||
#define PTR_ALPHA r1
|
||||
#define OLD_A r2
|
||||
#define OLD_B r3
|
||||
#define PTR_BETA [fp, #0 ]
|
||||
#define OLD_C [fp, #4 ]
|
||||
#define OLD_RSC [fp, #8 ]
|
||||
#define OLD_CSC [fp, #12 ]
|
||||
#define AUX [fp, #16 ]
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
* for store and restore of floating point
|
||||
* register
|
||||
*******************************************************/
|
||||
|
||||
#define L r2
|
||||
|
||||
#define AO r5
|
||||
#define BO r6
|
||||
|
||||
#define CO1 r7
|
||||
#define CO2 r8
|
||||
#define CO3 r9
|
||||
#define CO4 r12
|
||||
|
||||
|
||||
#define A_PRE 96
|
||||
#define B_PRE 96
|
||||
#define C_PRE 0
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro INIT4x4
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d18, d16
|
||||
vmov.f64 d19, d16
|
||||
vmov.f64 d20, d16
|
||||
vmov.f64 d21, d16
|
||||
vmov.f64 d22, d16
|
||||
vmov.f64 d23, d16
|
||||
vmov.f64 d24, d16
|
||||
vmov.f64 d25, d16
|
||||
vmov.f64 d26, d16
|
||||
vmov.f64 d27, d16
|
||||
vmov.f64 d28, d16
|
||||
vmov.f64 d29, d16
|
||||
vmov.f64 d30, d16
|
||||
vmov.f64 d31, d16
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_I
|
||||
pld [ BO , #B_PRE ]
|
||||
fldd d8 , [ BO ]
|
||||
fldd d0 , [ AO ]
|
||||
pld [ AO , #A_PRE ]
|
||||
|
||||
fldd d1 , [ AO, #8 ]
|
||||
fmuld d16 , d0, d8
|
||||
fldd d2 , [ AO, #16 ]
|
||||
fmuld d17 , d1, d8
|
||||
fldd d3 , [ AO, #24 ]
|
||||
fmuld d18 , d2, d8
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmuld d19 , d3, d8
|
||||
|
||||
fldd d10, [ BO, #16 ]
|
||||
fmuld d20 , d0, d9
|
||||
fldd d11, [ BO, #24 ]
|
||||
fmuld d21 , d1, d9
|
||||
add BO , BO, #32
|
||||
add AO , AO, #32
|
||||
fmuld d22 , d2, d9
|
||||
|
||||
pld [ BO , #B_PRE ]
|
||||
fldd d12, [ BO ]
|
||||
fmuld d23 , d3, d9
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fldd d4 , [ AO, #0 ]
|
||||
fmuld d24 , d0, d10
|
||||
fldd d5 , [ AO, #8 ]
|
||||
fmuld d25 , d1, d10
|
||||
fldd d6 , [ AO, #16 ]
|
||||
fmuld d26 , d2, d10
|
||||
fldd d7 , [ AO, #24 ]
|
||||
fmuld d27 , d3, d10
|
||||
|
||||
fldd d13, [ BO, #8 ]
|
||||
fmuld d28 , d0, d11
|
||||
fldd d14, [ BO, #16 ]
|
||||
fmuld d29 , d1, d11
|
||||
fldd d15, [ BO, #24 ]
|
||||
fmuld d30 , d2, d11
|
||||
fmuld d31 , d3, d11
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
|
||||
fmacd d16 , d4, d12
|
||||
pld [ AO , #A_PRE+32 ]
|
||||
fmacd d17 , d5, d12
|
||||
fldd d0 , [ AO , #32 ]
|
||||
fmacd d18 , d6, d12
|
||||
pld [ BO , #B_PRE+32 ]
|
||||
fmacd d19 , d7, d12
|
||||
|
||||
fldd d8 , [ BO , #32 ]
|
||||
fmacd d20 , d4, d13
|
||||
fldd d1 , [ AO, #40 ]
|
||||
fmacd d21 , d5, d13
|
||||
fldd d2 , [ AO, #48 ]
|
||||
fmacd d22 , d6, d13
|
||||
fldd d3 , [ AO, #56 ]
|
||||
fmacd d23 , d7, d13
|
||||
|
||||
fmacd d24 , d4, d14
|
||||
fmacd d25 , d5, d14
|
||||
fldd d9 , [ BO, #40 ]
|
||||
fmacd d26 , d6, d14
|
||||
fldd d10, [ BO, #48 ]
|
||||
fmacd d27 , d7, d14
|
||||
|
||||
fldd d11, [ BO, #56 ]
|
||||
fmacd d28 , d4, d15
|
||||
fmacd d29 , d5, d15
|
||||
add AO , AO, #64
|
||||
fmacd d30 , d6, d15
|
||||
add BO , BO, #64
|
||||
fmacd d31 , d7, d15
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
|
||||
fmacd d16 , d0, d8
|
||||
pld [ AO , #A_PRE ]
|
||||
fmacd d17 , d1, d8
|
||||
fldd d4 , [ AO ]
|
||||
fmacd d18 , d2, d8
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacd d19 , d3, d8
|
||||
|
||||
fldd d12, [ BO ]
|
||||
fmacd d20 , d0, d9
|
||||
fldd d5 , [ AO, #8 ]
|
||||
fmacd d21 , d1, d9
|
||||
fldd d6 , [ AO, #16 ]
|
||||
fmacd d22 , d2, d9
|
||||
fldd d7 , [ AO, #24 ]
|
||||
fmacd d23 , d3, d9
|
||||
|
||||
fmacd d24 , d0, d10
|
||||
fmacd d25 , d1, d10
|
||||
fldd d13, [ BO, #8 ]
|
||||
fmacd d26 , d2, d10
|
||||
fldd d14, [ BO, #16 ]
|
||||
fmacd d27 , d3, d10
|
||||
|
||||
fldd d15, [ BO, #24 ]
|
||||
fmacd d28 , d0, d11
|
||||
fmacd d29 , d1, d11
|
||||
fmacd d30 , d2, d11
|
||||
fmacd d31 , d3, d11
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
|
||||
fmacd d16 , d4, d12
|
||||
fmacd d17 , d5, d12
|
||||
add BO , BO, #32
|
||||
fmacd d18 , d6, d12
|
||||
add AO , AO, #32
|
||||
fmacd d19 , d7, d12
|
||||
|
||||
fmacd d20 , d4, d13
|
||||
fmacd d21 , d5, d13
|
||||
fmacd d22 , d6, d13
|
||||
fmacd d23 , d7, d13
|
||||
|
||||
fmacd d24 , d4, d14
|
||||
fmacd d25 , d5, d14
|
||||
fmacd d26 , d6, d14
|
||||
fmacd d27 , d7, d14
|
||||
|
||||
fmacd d28 , d4, d15
|
||||
fmacd d29 , d5, d15
|
||||
fmacd d30 , d6, d15
|
||||
fmacd d31 , d7, d15
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
||||
fldd d8 , [ BO ]
|
||||
pld [ BO , #B_PRE ]
|
||||
|
||||
fldd d0 , [ AO ]
|
||||
pld [ AO , #A_PRE ]
|
||||
fldd d1 , [ AO, #8 ]
|
||||
|
||||
fmacd d16 , d0, d8
|
||||
fldd d2 , [ AO, #16 ]
|
||||
fmacd d17 , d1, d8
|
||||
fldd d3 , [ AO, #24 ]
|
||||
fmacd d18 , d2, d8
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmacd d19 , d3, d8
|
||||
|
||||
fldd d10, [ BO, #16 ]
|
||||
fmacd d20 , d0, d9
|
||||
fldd d11, [ BO, #24 ]
|
||||
fmacd d21 , d1, d9
|
||||
fmacd d22 , d2, d9
|
||||
fmacd d23 , d3, d9
|
||||
|
||||
fmacd d24 , d0, d10
|
||||
fmacd d25 , d1, d10
|
||||
fmacd d26 , d2, d10
|
||||
fmacd d27 , d3, d10
|
||||
|
||||
fmacd d28 , d0, d11
|
||||
fmacd d29 , d1, d11
|
||||
add AO , AO, #32
|
||||
fmacd d30 , d2, d11
|
||||
add BO , BO, #32
|
||||
fmacd d31 , d3, d11
|
||||
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
|
||||
ldr r3, OLD_RSC // Row stride size
|
||||
lsl r3, r3, #3 // multiply with size of double
|
||||
|
||||
fldd d0, [ PTR_ALPHA ] // load alpha
|
||||
ldr r4, PTR_BETA
|
||||
fldd d1, [ r4 ] // load beta
|
||||
|
||||
//-----------------------------------------------------------
|
||||
mov r2, CO1 // save pointer
|
||||
mov r4, CO2 // save pointer
|
||||
fldd d8, [ CO1 ] // load value from C
|
||||
fldd d12, [ CO2 ] // load value from C
|
||||
fmuld d8, d8, d1 // multiply with beta
|
||||
add CO1, CO1, r3 // compute next pointer
|
||||
fmacd d8, d0, d16 // multiply sum with alpha and add to value of C
|
||||
add CO2, CO2, r3 // compute next pointer
|
||||
|
||||
fldd d9, [ CO1 ] // load value from C
|
||||
fldd d13, [ CO2 ] // load value from C
|
||||
fmuld d9, d9, d1 // multiply with beta
|
||||
add CO1, CO1, r3 // compute next pointer
|
||||
fmacd d9, d0, d17 // multiply sum with alpha and add to value of C
|
||||
add CO2, CO2, r3 // compute next pointer
|
||||
|
||||
fldd d10, [ CO1 ] // load value from C
|
||||
fldd d14, [ CO2 ] // load value from C
|
||||
fmuld d10, d10, d1 // multiply with beta
|
||||
add CO1, CO1, r3 // compute next pointer
|
||||
fmacd d10, d0, d18 // multiply sum with alpha and add to value of C
|
||||
add CO2, CO2, r3 // compute next pointer
|
||||
|
||||
fldd d11, [ CO1 ] // load value from C
|
||||
fldd d15, [ CO2 ] // load value from C
|
||||
fmuld d11, d11, d1 // multiply with beta
|
||||
mov CO1, r2 // restore pointer
|
||||
fmacd d11, d0, d19 // multiply sum with alpha and add to value of C
|
||||
mov CO2, r4 // restore pointer
|
||||
|
||||
fstd d8, [ CO1 ] // store value in C
|
||||
add CO1 , CO1, r3 // compute next pointer
|
||||
fstd d9, [ CO1 ] // store value in C
|
||||
add CO1 , CO1, r3 // compute next pointer
|
||||
fstd d10, [ CO1 ] // store value in C
|
||||
add CO1 , CO1, r3 // compute next pointer
|
||||
fstd d11, [ CO1 ] // store value in C
|
||||
|
||||
//-----------------------------------------------------------
|
||||
mov r2, CO3 // save pointer
|
||||
fldd d8, [ CO3 ] // load value from C
|
||||
fmuld d12, d12, d1 // multiply with beta
|
||||
add CO3, CO3, r3 // compute next pointer
|
||||
fmacd d12, d0, d20 // multiply sum with alpha and add to value of C
|
||||
|
||||
fldd d9, [ CO3 ] // load value from C
|
||||
fmuld d13, d13, d1 // multiply with beta
|
||||
add CO3, CO3, r3 // compute next pointer
|
||||
fmacd d13, d0, d21 // multiply sum with alpha and add to value of C
|
||||
|
||||
fldd d10, [ CO3 ] // load value from C
|
||||
fmuld d14, d14, d1 // multiply with beta
|
||||
add CO3, CO3, r3 // compute next pointer
|
||||
fmacd d14, d0, d22 // multiply sum with alpha and add to value of C
|
||||
|
||||
fldd d11, [ CO3 ] // load value from C
|
||||
fmuld d15, d15, d1 // multiply with beta
|
||||
mov CO3, r2 // restore pointer
|
||||
fmacd d15, d0, d23 // multiply sum with alpha and add to value of C
|
||||
|
||||
fstd d12, [ CO2 ] // store value in C
|
||||
add CO2 , CO2, r3 // compute next pointer
|
||||
fstd d13, [ CO2 ] // store value in C
|
||||
add CO2 , CO2, r3 // compute next pointer
|
||||
fstd d14, [ CO2 ] // store value in C
|
||||
add CO2 , CO2, r3 // compute next pointer
|
||||
fstd d15, [ CO2 ] // store value in C
|
||||
|
||||
//-----------------------------------------------------------
|
||||
mov r4, CO4 // save pointer
|
||||
fldd d12, [ CO4 ] // load value from C
|
||||
fmuld d8, d8, d1 // multiply with beta
|
||||
add CO4, CO4, r3 // compute next pointer
|
||||
fmacd d8, d0, d24 // multiply sum with alpha and add to value of C
|
||||
|
||||
fldd d13, [ CO4 ] // load value from C
|
||||
fmuld d9, d9, d1 // multiply with beta
|
||||
add CO4, CO4, r3 // compute next pointer
|
||||
fmacd d9, d0, d25 // multiply sum with alpha and add to value of C
|
||||
|
||||
fldd d14, [ CO4 ] // load value from C
|
||||
fmuld d10, d10, d1 // multiply with beta
|
||||
add CO4, CO4, r3 // compute next pointer
|
||||
fmacd d10, d0, d26 // multiply sum with alpha and add to value of C
|
||||
|
||||
fldd d15, [ CO4 ] // load value from C
|
||||
fmuld d11, d11, d1 // multiply with beta
|
||||
mov CO4, r4 // restore pointer
|
||||
fmacd d11, d0, d27 // multiply sum with alpha and add to value of C
|
||||
|
||||
|
||||
//-----------------------------------------------------------
|
||||
fstd d8, [ CO3 ] // store value in C
|
||||
fmuld d12, d12, d1 // multiply with beta
|
||||
add CO3 , CO3, r3 // compute next pointer
|
||||
fmacd d12, d0, d28 // multiply sum with alpha and add to value of C
|
||||
|
||||
fstd d9, [ CO3 ] // store value in C
|
||||
fmuld d13, d13, d1 // multiply with beta
|
||||
add CO3 , CO3, r3 // compute next pointer
|
||||
fmacd d13, d0, d29 // multiply sum with alpha and add to value of C
|
||||
|
||||
fstd d10, [ CO3 ] // store value in C
|
||||
fmuld d14, d14, d1 // multiply with beta
|
||||
add CO3 , CO3, r3 // compute next pointer
|
||||
fmacd d14, d0, d30 // multiply sum with alpha and add to value of C
|
||||
|
||||
fstd d11, [ CO3 ] // store value in C
|
||||
fmuld d15, d15, d1 // multiply with beta
|
||||
fstd d12, [ CO4 ] // store value in C
|
||||
fmacd d15, d0, d31 // multiply sum with alpha and add to value of C
|
||||
|
||||
add CO4 , CO4, r3 // compute next pointer
|
||||
fstd d13, [ CO4 ] // store value in C
|
||||
add CO4 , CO4, r3 // compute next pointer
|
||||
fstd d14, [ CO4 ] // store value in C
|
||||
add CO4 , CO4, r3 // compute next pointer
|
||||
fstd d15, [ CO4 ] // store value in C
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.arm
|
||||
.global REALNAME
|
||||
.func REALNAME
|
||||
|
||||
REALNAME:
|
||||
|
||||
push {r4 - r9, fp} // save register
|
||||
add fp, sp, #28 // add number of saved register multiplied by size of int
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
mov AO, OLD_A // pointer matrix A
|
||||
mov BO, OLD_B // pointer matrix B
|
||||
|
||||
sub r3, fp, #128
|
||||
vstm r3, { d8 - d15} // store floating point registers
|
||||
|
||||
ldr r2, OLD_C // pointer matrix C
|
||||
ldr r3, OLD_CSC // Col stride size of C
|
||||
lsl r3, r3, #3 // multiply with size of double
|
||||
|
||||
mov CO1, r2 // first line of C
|
||||
add CO2, CO1, r3 // second line of C
|
||||
add CO3, CO2, r3 // third line of C
|
||||
add CO4, CO3, r3 // fourth line of C
|
||||
|
||||
pld [ CO1, #C_PRE ] // prefetch the lines of C
|
||||
pld [ CO2, #C_PRE ] // prefetch the lines of C
|
||||
pld [ CO3, #C_PRE ] // prefetch the lines of C
|
||||
pld [ CO3, #C_PRE ] // prefetch the lines of C
|
||||
|
||||
dgemm_kernel_L4_M4_20:
|
||||
|
||||
asrs L , K, #3 // L = K / 8
|
||||
cmp L , #2
|
||||
blt dgemm_kernel_L4_M4_32
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs L, L, #2
|
||||
ble dgemm_kernel_L4_M4_22a
|
||||
.align 5
|
||||
|
||||
dgemm_kernel_L4_M4_22:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs L, L, #1
|
||||
bgt dgemm_kernel_L4_M4_22
|
||||
|
||||
dgemm_kernel_L4_M4_22a:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b dgemm_kernel_L4_M4_44
|
||||
|
||||
dgemm_kernel_L4_M4_32:
|
||||
|
||||
tst L, #1
|
||||
ble dgemm_kernel_L4_M4_40
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b dgemm_kernel_L4_M4_44
|
||||
|
||||
dgemm_kernel_L4_M4_40:
|
||||
|
||||
INIT4x4
|
||||
|
||||
dgemm_kernel_L4_M4_44:
|
||||
|
||||
ands L , K, #7 // L = K % 8
|
||||
ble dgemm_kernel_L4_M4_100
|
||||
|
||||
dgemm_kernel_L4_M4_46:
|
||||
|
||||
KERNEL4x4_SUB
|
||||
|
||||
subs L, L, #1
|
||||
bne dgemm_kernel_L4_M4_46
|
||||
|
||||
dgemm_kernel_L4_M4_100:
|
||||
|
||||
SAVE4x4
|
||||
|
||||
dgemm_kernel_L999:
|
||||
|
||||
sub r3, fp, #128
|
||||
vldm r3, { d8 - d15} // restore floating point registers
|
||||
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
134
config/armv7a/kernels/3/bli_gemm_opt_4x4.c
Normal file
134
config/armv7a/kernels/3/bli_gemm_opt_4x4.c
Normal file
@@ -0,0 +1,134 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2012, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern void bli_sgemm_kernel_4x4(dim_t k,
|
||||
float* alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
|
||||
void bli_sgemm_opt_4x4(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
|
||||
bli_sgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data);
|
||||
|
||||
}
|
||||
|
||||
extern void bli_dgemm_kernel_4x4(dim_t k,
|
||||
double* alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
|
||||
void bli_dgemm_opt_4x4(
|
||||
dim_t k,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
bli_dgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data);
|
||||
}
|
||||
|
||||
extern void bli_cgemm_kernel_2x2(dim_t k,
|
||||
scomplex* alpha,
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
|
||||
|
||||
|
||||
void bli_cgemm_opt_4x4(
|
||||
dim_t k,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
|
||||
bli_cgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data);
|
||||
}
|
||||
|
||||
extern void bli_zgemm_kernel_2x2(dim_t k,
|
||||
dcomplex* alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
);
|
||||
|
||||
|
||||
void bli_zgemm_opt_4x4(
|
||||
dim_t k,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
|
||||
bli_zgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data);
|
||||
}
|
||||
|
||||
53
config/armv7a/kernels/3/bli_gemm_opt_4x4.h
Normal file
53
config/armv7a/kernels/3/bli_gemm_opt_4x4.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
// #include "arm_neon.h"
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_opt_4x4 )
|
||||
|
||||
483
config/armv7a/kernels/3/bli_sgemm_kernel_4x4.S
Normal file
483
config/armv7a/kernels/3/bli_sgemm_kernel_4x4.S
Normal file
@@ -0,0 +1,483 @@
|
||||
|
||||
#define REALNAME bli_sgemm_kernel_4x4
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define K r0
|
||||
#define PTR_ALPHA r1
|
||||
#define OLD_A r2
|
||||
#define OLD_B r3
|
||||
#define PTR_BETA [fp, #0 ]
|
||||
#define OLD_C [fp, #4 ]
|
||||
#define OLD_RSC [fp, #8 ]
|
||||
#define OLD_CSC [fp, #12 ]
|
||||
#define AUX [fp, #16 ]
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
* for store and restore of floating point
|
||||
* register
|
||||
*******************************************************/
|
||||
|
||||
#define L r2
|
||||
|
||||
#define AO r5
|
||||
#define BO r6
|
||||
|
||||
#define CO1 r7
|
||||
#define CO2 r8
|
||||
#define CO3 r9
|
||||
#define CO4 r12
|
||||
|
||||
|
||||
#define A_PRE 96
|
||||
#define B_PRE 96
|
||||
#define C_PRE 0
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro INIT4x4
|
||||
|
||||
vsub.f32 s16 , s16 , s16
|
||||
vmov.f32 s17, s16
|
||||
vmov.f32 s18, s16
|
||||
vmov.f32 s19, s16
|
||||
vmov.f32 s20, s16
|
||||
vmov.f32 s21, s16
|
||||
vmov.f32 s22, s16
|
||||
vmov.f32 s23, s16
|
||||
vmov.f32 s24, s16
|
||||
vmov.f32 s25, s16
|
||||
vmov.f32 s26, s16
|
||||
vmov.f32 s27, s16
|
||||
vmov.f32 s28, s16
|
||||
vmov.f32 s29, s16
|
||||
vmov.f32 s30, s16
|
||||
vmov.f32 s31, s16
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL4x4_I
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fldmias AO!, { s0 - s1 }
|
||||
pld [ BO , #B_PRE ]
|
||||
fldmias BO!, { s8 - s9 }
|
||||
|
||||
fmuls s16 , s0, s8
|
||||
fldmias AO!, { s2 - s3 }
|
||||
fmuls s17 , s1, s8
|
||||
fmuls s18 , s2, s8
|
||||
fldmias BO!, { s10 - s11 }
|
||||
fmuls s19 , s3, s8
|
||||
|
||||
fmuls s20 , s0, s9
|
||||
fldmias AO!, { s4 - s5 }
|
||||
fmuls s21 , s1, s9
|
||||
fmuls s22 , s2, s9
|
||||
fldmias AO!, { s6 - s7 }
|
||||
fmuls s23 , s3, s9
|
||||
|
||||
fmuls s24 , s0, s10
|
||||
fldmias BO!, { s12 - s13 }
|
||||
fmuls s25 , s1, s10
|
||||
fmuls s26 , s2, s10
|
||||
fldmias BO!, { s14 - s15 }
|
||||
fmuls s27 , s3, s10
|
||||
|
||||
fmuls s28 , s0, s11
|
||||
fmuls s29 , s1, s11
|
||||
fmuls s30 , s2, s11
|
||||
fmuls s31 , s3, s11
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL4x4_M2
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fmacs s16 , s4, s12
|
||||
fmacs s17 , s5, s12
|
||||
fldmias AO!, { s0 - s3 }
|
||||
fmacs s18 , s6, s12
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacs s19 , s7, s12
|
||||
|
||||
fmacs s20 , s4, s13
|
||||
fldmias BO!, { s8 - s11 }
|
||||
fmacs s21 , s5, s13
|
||||
fmacs s22 , s6, s13
|
||||
//fldmias AO!, { s2 - s3 }
|
||||
fmacs s23 , s7, s13
|
||||
|
||||
fmacs s24 , s4, s14
|
||||
//fldmias BO!, { s10 - s11 }
|
||||
fmacs s25 , s5, s14
|
||||
fmacs s26 , s6, s14
|
||||
fmacs s27 , s7, s14
|
||||
|
||||
fmacs s28 , s4, s15
|
||||
fmacs s29 , s5, s15
|
||||
fmacs s30 , s6, s15
|
||||
fmacs s31 , s7, s15
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL4x4_M1
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
fldmias AO!, { s4 - s7 }
|
||||
fmacs s17 , s1, s8
|
||||
fmacs s18 , s2, s8
|
||||
fldmias BO!, { s12 - s15 }
|
||||
//fldmias AO!, { s6 - s7 }
|
||||
fmacs s19 , s3, s8
|
||||
|
||||
fmacs s20 , s0, s9
|
||||
fmacs s21 , s1, s9
|
||||
fmacs s22 , s2, s9
|
||||
//fldmias BO!, { s14 - s15 }
|
||||
fmacs s23 , s3, s9
|
||||
|
||||
fmacs s24 , s0, s10
|
||||
fmacs s25 , s1, s10
|
||||
fmacs s26 , s2, s10
|
||||
fmacs s27 , s3, s10
|
||||
|
||||
fmacs s28 , s0, s11
|
||||
fmacs s29 , s1, s11
|
||||
fmacs s30 , s2, s11
|
||||
fmacs s31 , s3, s11
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
.macro KERNEL4x4_E
|
||||
|
||||
fmacs s16 , s4, s12
|
||||
fmacs s17 , s5, s12
|
||||
fmacs s18 , s6, s12
|
||||
fmacs s19 , s7, s12
|
||||
|
||||
fmacs s20 , s4, s13
|
||||
fmacs s21 , s5, s13
|
||||
fmacs s22 , s6, s13
|
||||
fmacs s23 , s7, s13
|
||||
|
||||
fmacs s24 , s4, s14
|
||||
fmacs s25 , s5, s14
|
||||
fmacs s26 , s6, s14
|
||||
fmacs s27 , s7, s14
|
||||
|
||||
fmacs s28 , s4, s15
|
||||
fmacs s29 , s5, s15
|
||||
fmacs s30 , s6, s15
|
||||
fmacs s31 , s7, s15
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
|
||||
.macro KERNEL4x4_SUB
|
||||
|
||||
flds s8 , [ BO ]
|
||||
|
||||
flds s0 , [ AO ]
|
||||
flds s1 , [ AO, #4 ]
|
||||
|
||||
fmacs s16 , s0, s8
|
||||
flds s2 , [ AO, #8 ]
|
||||
fmacs s17 , s1, s8
|
||||
flds s3 , [ AO, #12 ]
|
||||
fmacs s18 , s2, s8
|
||||
flds s9 , [ BO, #4 ]
|
||||
fmacs s19 , s3, s8
|
||||
|
||||
flds s10, [ BO, #8 ]
|
||||
fmacs s20 , s0, s9
|
||||
flds s11, [ BO, #12 ]
|
||||
fmacs s21 , s1, s9
|
||||
fmacs s22 , s2, s9
|
||||
fmacs s23 , s3, s9
|
||||
|
||||
fmacs s24 , s0, s10
|
||||
fmacs s25 , s1, s10
|
||||
fmacs s26 , s2, s10
|
||||
fmacs s27 , s3, s10
|
||||
|
||||
fmacs s28 , s0, s11
|
||||
fmacs s29 , s1, s11
|
||||
add AO , AO, #16
|
||||
fmacs s30 , s2, s11
|
||||
add BO , BO, #16
|
||||
fmacs s31 , s3, s11
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro SAVE4x4
|
||||
|
||||
ldr r3, OLD_RSC // Row stride size
|
||||
lsl r3, r3, #2 // multiply with size of float
|
||||
|
||||
flds s0, [ PTR_ALPHA ] // load alpha
|
||||
ldr r4, PTR_BETA
|
||||
flds s1, [ r4 ] // load beta
|
||||
|
||||
//-----------------------------------------------------------
|
||||
mov r2, CO1 // save pointer
|
||||
mov r4, CO2 // save pointer
|
||||
flds s8, [ CO1 ] // load value from C
|
||||
flds s12, [ CO2 ] // load value from C
|
||||
fmuls s8, s8, s1 // multiply with beta
|
||||
add CO1, CO1, r3 // compute next pointer
|
||||
fmacs s8, s0, s16 // multiply sum with alpha and add to value of C
|
||||
add CO2, CO2, r3 // compute next pointer
|
||||
|
||||
flds s9, [ CO1 ] // load value from C
|
||||
flds s13, [ CO2 ] // load value from C
|
||||
fmuls s9, s9, s1 // multiply with beta
|
||||
add CO1, CO1, r3 // compute next pointer
|
||||
fmacs s9, s0, s17 // multiply sum with alpha and add to value of C
|
||||
add CO2, CO2, r3 // compute next pointer
|
||||
|
||||
flds s10, [ CO1 ] // load value from C
|
||||
flds s14, [ CO2 ] // load value from C
|
||||
fmuls s10, s10, s1 // multiply with beta
|
||||
add CO1, CO1, r3 // compute next pointer
|
||||
fmacs s10, s0, s18 // multiply sum with alpha and add to value of C
|
||||
add CO2, CO2, r3 // compute next pointer
|
||||
|
||||
flds s11, [ CO1 ] // load value from C
|
||||
flds s15, [ CO2 ] // load value from C
|
||||
fmuls s11, s11, s1 // multiply with beta
|
||||
mov CO1, r2 // restore pointer
|
||||
fmacs s11, s0, s19 // multiply sum with alpha and add to value of C
|
||||
mov CO2, r4 // restore pointer
|
||||
|
||||
fsts s8, [ CO1 ] // store value in C
|
||||
add CO1 , CO1, r3 // compute next pointer
|
||||
fsts s9, [ CO1 ] // store value in C
|
||||
add CO1 , CO1, r3 // compute next pointer
|
||||
fsts s10, [ CO1 ] // store value in C
|
||||
add CO1 , CO1, r3 // compute next pointer
|
||||
fsts s11, [ CO1 ] // store value in C
|
||||
|
||||
//-----------------------------------------------------------
|
||||
mov r2, CO3 // save pointer
|
||||
flds s8, [ CO3 ] // load value from C
|
||||
fmuls s12, s12, s1 // multiply with beta
|
||||
add CO3, CO3, r3 // compute next pointer
|
||||
fmacs s12, s0, s20 // multiply sum with alpha and add to value of C
|
||||
|
||||
flds s9, [ CO3 ] // load value from C
|
||||
fmuls s13, s13, s1 // multiply with beta
|
||||
add CO3, CO3, r3 // compute next pointer
|
||||
fmacs s13, s0, s21 // multiply sum with alpha and add to value of C
|
||||
|
||||
flds s10, [ CO3 ] // load value from C
|
||||
fmuls s14, s14, s1 // multiply with beta
|
||||
add CO3, CO3, r3 // compute next pointer
|
||||
fmacs s14, s0, s22 // multiply sum with alpha and add to value of C
|
||||
|
||||
flds s11, [ CO3 ] // load value from C
|
||||
fmuls s15, s15, s1 // multiply with beta
|
||||
mov CO3, r2 // restore pointer
|
||||
fmacs s15, s0, s23 // multiply sum with alpha and add to value of C
|
||||
|
||||
fsts s12, [ CO2 ] // store value in C
|
||||
add CO2 , CO2, r3 // compute next pointer
|
||||
fsts s13, [ CO2 ] // store value in C
|
||||
add CO2 , CO2, r3 // compute next pointer
|
||||
fsts s14, [ CO2 ] // store value in C
|
||||
add CO2 , CO2, r3 // compute next pointer
|
||||
fsts s15, [ CO2 ] // store value in C
|
||||
|
||||
//-----------------------------------------------------------
|
||||
mov r4, CO4 // save pointer
|
||||
flds s12, [ CO4 ] // load value from C
|
||||
fmuls s8, s8, s1 // multiply with beta
|
||||
add CO4, CO4, r3 // compute next pointer
|
||||
fmacs s8, s0, s24 // multiply sum with alpha and add to value of C
|
||||
|
||||
flds s13, [ CO4 ] // load value from C
|
||||
fmuls s9, s9, s1 // multiply with beta
|
||||
add CO4, CO4, r3 // compute next pointer
|
||||
fmacs s9, s0, s25 // multiply sum with alpha and add to value of C
|
||||
|
||||
flds s14, [ CO4 ] // load value from C
|
||||
fmuls s10, s10, s1 // multiply with beta
|
||||
add CO4, CO4, r3 // compute next pointer
|
||||
fmacs s10, s0, s26 // multiply sum with alpha and add to value of C
|
||||
|
||||
flds s15, [ CO4 ] // load value from C
|
||||
fmuls s11, s11, s1 // multiply with beta
|
||||
mov CO4, r4 // restore pointer
|
||||
fmacs s11, s0, s27 // multiply sum with alpha and add to value of C
|
||||
|
||||
|
||||
//-----------------------------------------------------------
|
||||
fsts s8, [ CO3 ] // store value in C
|
||||
fmuls s12, s12, s1 // multiply with beta
|
||||
add CO3 , CO3, r3 // compute next pointer
|
||||
fmacs s12, s0, s28 // multiply sum with alpha and add to value of C
|
||||
|
||||
fsts s9, [ CO3 ] // store value in C
|
||||
fmuls s13, s13, s1 // multiply with beta
|
||||
add CO3 , CO3, r3 // compute next pointer
|
||||
fmacs s13, s0, s29 // multiply sum with alpha and add to value of C
|
||||
|
||||
fsts s10, [ CO3 ] // store value in C
|
||||
fmuls s14, s14, s1 // multiply with beta
|
||||
add CO3 , CO3, r3 // compute next pointer
|
||||
fmacs s14, s0, s30 // multiply sum with alpha and add to value of C
|
||||
|
||||
fsts s11, [ CO3 ] // store value in C
|
||||
fmuls s15, s15, s1 // multiply with beta
|
||||
fsts s12, [ CO4 ] // store value in C
|
||||
fmacs s15, s0, s31 // multiply sum with alpha and add to value of C
|
||||
|
||||
add CO4 , CO4, r3 // compute next pointer
|
||||
fsts s13, [ CO4 ] // store value in C
|
||||
add CO4 , CO4, r3 // compute next pointer
|
||||
fsts s14, [ CO4 ] // store value in C
|
||||
add CO4 , CO4, r3 // compute next pointer
|
||||
fsts s15, [ CO4 ] // store value in C
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.arm
|
||||
.global REALNAME
|
||||
.func REALNAME
|
||||
|
||||
REALNAME:
|
||||
|
||||
push {r4 - r9, fp} // save register
|
||||
add fp, sp, #28 // add number of saved register multiplied by size of int
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
mov AO, OLD_A // pointer matrix A
|
||||
mov BO, OLD_B // pointer matrix B
|
||||
|
||||
sub r3, fp, #128
|
||||
vstm r3, { s8 - s31 } // store floating point registers
|
||||
|
||||
ldr r2, OLD_C // pointer matrix C
|
||||
ldr r3, OLD_CSC // Col stride size of C
|
||||
lsl r3, r3, #2 // multiply with size of float
|
||||
|
||||
mov CO1, r2 // first line of C
|
||||
add CO2, CO1, r3 // second line of C
|
||||
add CO3, CO2, r3 // third line of C
|
||||
add CO4, CO3, r3 // fourth line of C
|
||||
|
||||
pld [ CO1, #C_PRE ] // prefetch the lines of C
|
||||
pld [ CO2, #C_PRE ] // prefetch the lines of C
|
||||
pld [ CO3, #C_PRE ] // prefetch the lines of C
|
||||
pld [ CO3, #C_PRE ] // prefetch the lines of C
|
||||
|
||||
sgemm_kernel_L4_M4_20:
|
||||
|
||||
asrs L , K, #3 // L = K / 8
|
||||
cmp L , #2
|
||||
blt sgemm_kernel_L4_M4_32
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs L, L, #2
|
||||
ble sgemm_kernel_L4_M4_22a
|
||||
.align 5
|
||||
|
||||
sgemm_kernel_L4_M4_22:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
subs L, L, #1
|
||||
bgt sgemm_kernel_L4_M4_22
|
||||
|
||||
sgemm_kernel_L4_M4_22a:
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b sgemm_kernel_L4_M4_44
|
||||
|
||||
sgemm_kernel_L4_M4_32:
|
||||
|
||||
tst L, #1
|
||||
ble sgemm_kernel_L4_M4_40
|
||||
|
||||
KERNEL4x4_I
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_M2
|
||||
KERNEL4x4_M1
|
||||
KERNEL4x4_E
|
||||
|
||||
b sgemm_kernel_L4_M4_44
|
||||
|
||||
sgemm_kernel_L4_M4_40:
|
||||
|
||||
INIT4x4
|
||||
|
||||
sgemm_kernel_L4_M4_44:
|
||||
|
||||
ands L , K, #7 // L = K % 8
|
||||
ble sgemm_kernel_L4_M4_100
|
||||
|
||||
sgemm_kernel_L4_M4_46:
|
||||
|
||||
KERNEL4x4_SUB
|
||||
|
||||
subs L, L, #1
|
||||
bne sgemm_kernel_L4_M4_46
|
||||
|
||||
sgemm_kernel_L4_M4_100:
|
||||
|
||||
SAVE4x4
|
||||
|
||||
sgemm_kernel_L999:
|
||||
|
||||
sub r3, fp, #128
|
||||
vldm r3, { s8 - s31 } // restore floating point registers
|
||||
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
506
config/armv7a/kernels/3/bli_zgemm_kernel_2x2.S
Normal file
506
config/armv7a/kernels/3/bli_zgemm_kernel_2x2.S
Normal file
@@ -0,0 +1,506 @@
|
||||
|
||||
#define REALNAME bli_zgemm_kernel_2x2
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define K r0
|
||||
#define PTR_ALPHA r1
|
||||
#define OLD_A r2
|
||||
#define OLD_B r3
|
||||
#define PTR_BETA [fp, #0 ]
|
||||
#define OLD_C [fp, #4 ]
|
||||
#define OLD_RSC [fp, #8 ]
|
||||
#define OLD_CSC [fp, #12 ]
|
||||
#define AUX [fp, #16 ]
|
||||
|
||||
/******************************************************
|
||||
* [fp, #-128] - [fp, #-64] is reserved
|
||||
* for store and restore of floating point
|
||||
* register
|
||||
*******************************************************/
|
||||
|
||||
#define L r2
|
||||
|
||||
#define AO r5
|
||||
#define BO r6
|
||||
|
||||
#define CO1 r7
|
||||
#define CO2 r8
|
||||
|
||||
|
||||
#define A_PRE 96
|
||||
#define B_PRE 96
|
||||
#define C_PRE 0
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
#define FMAC_BR fnmacd
|
||||
#define FMAC_BI fmacd
|
||||
|
||||
#define NN 1
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
|
||||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fnmacd
|
||||
|
||||
#elif defined(CN) || defined(CT)
|
||||
|
||||
#define FADD_R faddd
|
||||
#define FADD_I fsubd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#elif defined(NC) || defined(TC)
|
||||
|
||||
#define FADD_R faddd
|
||||
#define FADD_I fsubd
|
||||
|
||||
#define FMAC_R1 fmacd
|
||||
#define FMAC_R2 fnmacd
|
||||
#define FMAC_I1 fmacd
|
||||
#define FMAC_I2 fmacd
|
||||
|
||||
#else
|
||||
|
||||
#define FADD_R fsubd
|
||||
#define FADD_I faddd
|
||||
|
||||
#define FMAC_R1 fnmacd
|
||||
#define FMAC_R2 fmacd
|
||||
#define FMAC_I1 fnmacd
|
||||
#define FMAC_I2 fnmacd
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
.macro INIT2x2
|
||||
|
||||
vsub.f64 d16 , d16 , d16
|
||||
vmov.f64 d17, d16
|
||||
vmov.f64 d18, d16
|
||||
vmov.f64 d19, d16
|
||||
vmov.f64 d20, d16
|
||||
vmov.f64 d21, d16
|
||||
vmov.f64 d22, d16
|
||||
vmov.f64 d23, d16
|
||||
vmov.f64 d24, d16
|
||||
vmov.f64 d25, d16
|
||||
vmov.f64 d26, d16
|
||||
vmov.f64 d27, d16
|
||||
vmov.f64 d28, d16
|
||||
vmov.f64 d29, d16
|
||||
vmov.f64 d30, d16
|
||||
vmov.f64 d31, d16
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL2x2_I
|
||||
pld [ AO , #A_PRE ]
|
||||
pld [ BO , #B_PRE ]
|
||||
fldd d0 , [ AO ]
|
||||
fldd d1 , [ AO, #8 ]
|
||||
fldd d8 , [ BO ]
|
||||
fldd d9 , [ BO, #8 ]
|
||||
|
||||
fmuld d16 , d0, d8
|
||||
fldd d2 , [ AO, #16 ]
|
||||
fmuld d24 , d1, d9
|
||||
fldd d3 , [ AO, #24 ]
|
||||
fmuld d17 , d0, d9
|
||||
fldd d10, [ BO, #16 ]
|
||||
fmuld d25 , d1, d8
|
||||
|
||||
fldd d11, [ BO, #24 ]
|
||||
fmuld d18 , d2, d8
|
||||
add BO , BO, #32
|
||||
fmuld d26 , d3, d9
|
||||
add AO , AO, #32
|
||||
fmuld d19 , d2, d9
|
||||
pld [ BO , #B_PRE ]
|
||||
fmuld d27 , d3, d8
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
fmuld d20 , d0, d10
|
||||
fldd d4 , [ AO, #0 ]
|
||||
fmuld d28 , d1, d11
|
||||
fldd d5 , [ AO, #8 ]
|
||||
fmuld d21 , d0, d11
|
||||
fldd d12, [ BO ]
|
||||
fmuld d29 , d1, d10
|
||||
|
||||
fldd d13, [ BO, #8 ]
|
||||
fmuld d22 , d2, d10
|
||||
fldd d6 , [ AO, #16 ]
|
||||
fmuld d30 , d3, d11
|
||||
fldd d7 , [ AO, #24 ]
|
||||
fmuld d23 , d2, d11
|
||||
fldd d14, [ BO, #16 ]
|
||||
fmuld d31 , d3, d10
|
||||
fldd d15, [ BO, #24 ]
|
||||
|
||||
add BO , BO, #32
|
||||
add AO , AO, #32
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
.macro KERNEL2x2_M1
|
||||
pld [ AO , #A_PRE ]
|
||||
|
||||
fmacd d16 , d0, d8
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacd d24 , d1, d9
|
||||
fldd d4 , [ AO, #0 ]
|
||||
fmacd d17 , d0, d9
|
||||
fldd d5 , [ AO, #8 ]
|
||||
fmacd d25 , d1, d8
|
||||
|
||||
fldd d12, [ BO ]
|
||||
fmacd d18 , d2, d8
|
||||
fldd d13, [ BO, #8 ]
|
||||
fmacd d26 , d3, d9
|
||||
fldd d6 , [ AO, #16 ]
|
||||
fmacd d19 , d2, d9
|
||||
fldd d7 , [ AO, #24 ]
|
||||
fmacd d27 , d3, d8
|
||||
|
||||
fmacd d20 , d0, d10
|
||||
fldd d14, [ BO, #16 ]
|
||||
fmacd d28 , d1, d11
|
||||
fmacd d21 , d0, d11
|
||||
fldd d15, [ BO, #24 ]
|
||||
fmacd d29 , d1, d10
|
||||
|
||||
fmacd d22 , d2, d10
|
||||
add BO , BO, #32
|
||||
fmacd d30 , d3, d11
|
||||
fmacd d23 , d2, d11
|
||||
add AO , AO, #32
|
||||
fmacd d31 , d3, d10
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL2x2_M2
|
||||
pld [ AO , #A_PRE ]
|
||||
|
||||
fmacd d16 , d4, d12
|
||||
pld [ BO , #B_PRE ]
|
||||
fmacd d24 , d5, d13
|
||||
fldd d0 , [ AO, #0 ]
|
||||
fmacd d17 , d4, d13
|
||||
fldd d1 , [ AO, #8 ]
|
||||
fmacd d25 , d5, d12
|
||||
|
||||
fmacd d18 , d6, d12
|
||||
fldd d8 , [ BO ]
|
||||
fmacd d26 , d7, d13
|
||||
fldd d9 , [ BO, #8 ]
|
||||
fmacd d19 , d6, d13
|
||||
fmacd d27 , d7, d12
|
||||
|
||||
fldd d2 , [ AO, #16 ]
|
||||
fmacd d20 , d4, d14
|
||||
fldd d3 , [ AO, #24 ]
|
||||
fmacd d28 , d5, d15
|
||||
fmacd d21 , d4, d15
|
||||
fldd d10, [ BO, #16 ]
|
||||
fmacd d29 , d5, d14
|
||||
|
||||
fldd d11, [ BO, #24 ]
|
||||
fmacd d22 , d6, d14
|
||||
fmacd d30 , d7, d15
|
||||
add BO , BO, #32
|
||||
fmacd d23 , d6, d15
|
||||
add AO , AO, #32
|
||||
fmacd d31 , d7, d14
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL2x2_E
|
||||
|
||||
fmacd d16 , d4, d12
|
||||
fmacd d24 , d5, d13
|
||||
fmacd d17 , d4, d13
|
||||
fmacd d25 , d5, d12
|
||||
|
||||
fmacd d18 , d6, d12
|
||||
fmacd d26 , d7, d13
|
||||
fmacd d19 , d6, d13
|
||||
fmacd d27 , d7, d12
|
||||
|
||||
fmacd d20 , d4, d14
|
||||
fmacd d28 , d5, d15
|
||||
fmacd d21 , d4, d15
|
||||
fmacd d29 , d5, d14
|
||||
|
||||
fmacd d22 , d6, d14
|
||||
fmacd d30 , d7, d15
|
||||
fmacd d23 , d6, d15
|
||||
fmacd d31 , d7, d14
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL2x2_SUB
|
||||
|
||||
pld [ AO , #A_PRE ]
|
||||
pld [ BO , #B_PRE ]
|
||||
fldd d0 , [ AO ]
|
||||
fldd d1 , [ AO, #8 ]
|
||||
fldd d8 , [ BO ]
|
||||
fldd d9 , [ BO, #8 ]
|
||||
|
||||
fmacd d16 , d0, d8
|
||||
fldd d2 , [ AO, #16 ]
|
||||
fmacd d24 , d1, d9
|
||||
fldd d3 , [ AO, #24 ]
|
||||
fmacd d17 , d0, d9
|
||||
fldd d10, [ BO, #16 ]
|
||||
fmacd d25 , d1, d8
|
||||
|
||||
fldd d11, [ BO, #24 ]
|
||||
fmacd d18 , d2, d8
|
||||
fmacd d26 , d3, d9
|
||||
fmacd d19 , d2, d9
|
||||
fmacd d27 , d3, d8
|
||||
|
||||
fmacd d20 , d0, d10
|
||||
fmacd d28 , d1, d11
|
||||
fmacd d21 , d0, d11
|
||||
fmacd d29 , d1, d10
|
||||
|
||||
fmacd d22 , d2, d10
|
||||
add BO , BO, #32
|
||||
fmacd d30 , d3, d11
|
||||
fmacd d23 , d2, d11
|
||||
add AO , AO, #32
|
||||
fmacd d31 , d3, d10
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
|
||||
.macro SAVE2x2
|
||||
|
||||
ldr r3, OLD_RSC // Row stride size
|
||||
lsl r3, r3, #4 // multiply with size of complex double
|
||||
|
||||
fldd d0, [ PTR_ALPHA ] // load real part of alpha
|
||||
fldd d1, [ PTR_ALPHA, #8 ] // load imag part of alpha
|
||||
ldr r4, PTR_BETA
|
||||
fldd d2, [ r4 ] // load real part of beta
|
||||
fldd d3, [ r4, #8 ] // load imag part of beta
|
||||
|
||||
// Add/Sub the real and the imag parts
|
||||
FADD_R d16, d24 , d16
|
||||
FADD_I d17, d25 , d17
|
||||
FADD_R d18, d26 , d18
|
||||
FADD_I d19, d27 , d19
|
||||
FADD_R d20, d28 , d20
|
||||
FADD_I d21, d29 , d21
|
||||
FADD_R d22, d30 , d22
|
||||
FADD_I d23, d31 , d23
|
||||
|
||||
mov r4, CO1 // save pointer
|
||||
fldmiad CO1, { d4 - d5 } // read real and imag part from C
|
||||
add CO1, CO1, r3
|
||||
|
||||
mov r2, CO2 // save pointer
|
||||
fldmiad CO2, { d8 - d9 } // read real and imag part from C
|
||||
add CO2, CO2, r3
|
||||
|
||||
fmuld d24, d4, d2 // multiply Beta-real with C-real
|
||||
fmuld d25, d5, d2 // multiply Beta-real with C-imag
|
||||
fmuld d28, d8, d2 // multiply Beta-real with C-real
|
||||
fmuld d29, d9, d2 // multiply Beta-real with C-imag
|
||||
|
||||
FMAC_BR d24, d3, d5 // multiply beta-imag with C-imag and add
|
||||
FMAC_BI d25, d3, d4 // multiply beta-imag with C-real and add
|
||||
FMAC_BR d28, d3, d9 // multiply beta-imag with C-imag and add
|
||||
FMAC_BI d29, d3, d8 // multiply beta-imag with C-real and add
|
||||
|
||||
FMAC_R1 d24 , d0 , d16
|
||||
FMAC_I1 d25 , d0 , d17
|
||||
FMAC_R2 d24 , d1 , d17
|
||||
FMAC_I2 d25 , d1 , d16
|
||||
|
||||
FMAC_R1 d28 , d0 , d20
|
||||
FMAC_I1 d29 , d0 , d21
|
||||
FMAC_R2 d28 , d1 , d21
|
||||
FMAC_I2 d29 , d1 , d20
|
||||
|
||||
fldmiad CO1, { d4 - d5 } // read real and imag part from C
|
||||
fldmiad CO2, { d8 - d9 } // read real and imag part from C
|
||||
|
||||
fmuld d26, d4, d2 // multiply Beta-real with C-real
|
||||
fmuld d27, d5, d2 // multiply Beta-real with C-imag
|
||||
fmuld d30, d8, d2 // multiply Beta-real with C-real
|
||||
fmuld d31, d9, d2 // multiply Beta-real with C-imag
|
||||
|
||||
FMAC_BR d26, d3, d5 // multiply beta-imag with C-imag and add
|
||||
FMAC_BI d27, d3, d4 // multiply beta-imag with C-real and add
|
||||
FMAC_BR d30, d3, d9 // multiply beta-imag with C-imag and add
|
||||
FMAC_BI d31, d3, d8 // multiply beta-imag with C-real and add
|
||||
|
||||
FMAC_R1 d26 , d0 , d18
|
||||
FMAC_I1 d27 , d0 , d19
|
||||
FMAC_R2 d26 , d1 , d19
|
||||
FMAC_I2 d27 , d1 , d18
|
||||
|
||||
FMAC_R1 d30, d0 , d22
|
||||
FMAC_I1 d31, d0 , d23
|
||||
FMAC_R2 d30, d1 , d23
|
||||
FMAC_I2 d31, d1 , d22
|
||||
|
||||
mov CO1, r4 // restore pointer
|
||||
mov CO2, r2 // restore pointer
|
||||
fstmiad CO1, { d24 - d25 }
|
||||
fstmiad CO2, { d28 - d29 }
|
||||
add CO1, CO1, r3
|
||||
add CO2, CO2, r3
|
||||
fstmiad CO1, { d26 - d27 }
|
||||
fstmiad CO2, { d30 - d31 }
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.arm
|
||||
.global REALNAME
|
||||
.func REALNAME
|
||||
|
||||
REALNAME:
|
||||
|
||||
push {r4 - r9, fp} // save register
|
||||
add fp, sp, #28 // add number of saved register multiplied by size of int
|
||||
sub sp, sp, #STACKSIZE // reserve stack
|
||||
|
||||
mov AO, OLD_A // pointer matrix A
|
||||
mov BO, OLD_B // pointer matrix B
|
||||
|
||||
sub r3, fp, #128
|
||||
vstm r3, { d8 - d15} // store floating point registers
|
||||
|
||||
ldr r2, OLD_C // pointer matrix C
|
||||
ldr r3, OLD_CSC // Col stride size of C
|
||||
lsl r3, r3, #4 // multiply with size of complex double
|
||||
|
||||
mov CO1, r2 // first line of C
|
||||
add CO2, CO1, r3 // second line of C
|
||||
|
||||
pld [ CO1, #C_PRE ] // prefetch the lines of C
|
||||
pld [ CO2, #C_PRE ] // prefetch the lines of C
|
||||
|
||||
zgemm_kernel_L2_M2_20:
|
||||
|
||||
asrs L , K, #3 // L = K / 8
|
||||
cmp L , #2
|
||||
blt zgemm_kernel_L2_M2_32
|
||||
|
||||
KERNEL2x2_I
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
subs L, L, #2
|
||||
ble zgemm_kernel_L2_M2_22a
|
||||
.align 5
|
||||
|
||||
zgemm_kernel_L2_M2_22:
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
subs L, L, #1
|
||||
bgt zgemm_kernel_L2_M2_22
|
||||
|
||||
zgemm_kernel_L2_M2_22a:
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_E
|
||||
|
||||
b zgemm_kernel_L2_M2_44
|
||||
|
||||
zgemm_kernel_L2_M2_32:
|
||||
|
||||
tst L, #1
|
||||
ble zgemm_kernel_L2_M2_40
|
||||
|
||||
KERNEL2x2_I
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_M2
|
||||
KERNEL2x2_M1
|
||||
KERNEL2x2_E
|
||||
|
||||
b zgemm_kernel_L2_M2_44
|
||||
|
||||
zgemm_kernel_L2_M2_40:
|
||||
|
||||
INIT2x2
|
||||
|
||||
zgemm_kernel_L2_M2_44:
|
||||
|
||||
ands L , K, #7 // L = K % 8
|
||||
ble zgemm_kernel_L2_M2_100
|
||||
|
||||
zgemm_kernel_L2_M2_46:
|
||||
|
||||
KERNEL2x2_SUB
|
||||
|
||||
subs L, L, #1
|
||||
bne zgemm_kernel_L2_M2_46
|
||||
|
||||
zgemm_kernel_L2_M2_100:
|
||||
|
||||
SAVE2x2
|
||||
|
||||
zgemm_kernel_L999:
|
||||
|
||||
sub r3, fp, #128
|
||||
vldm r3, { d8 - d15} // restore floating point registers
|
||||
|
||||
sub sp, fp, #28
|
||||
pop {r4 - r9, fp}
|
||||
bx lr
|
||||
|
||||
108
config/armv7a/make_defs.mk
Normal file
108
config/armv7a/make_defs.mk
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
# Only include this block of code once.
|
||||
ifndef MAKE_DEFS_MK_INCLUDED
|
||||
MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := gcc
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CVECFLAGS := #-msse3 # -mfpmath=sse
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
|
||||
endif
|
||||
Reference in New Issue
Block a user