diff --git a/config/armv7a/bli_config.h b/config/armv7a/bli_config.h new file mode 100644 index 000000000..e1c28991c --- /dev/null +++ b/config/armv7a/bli_config.h @@ -0,0 +1,169 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_CONFIG_H +#define BLIS_CONFIG_H + + +// -- OPERATING SYSTEM --------------------------------------------------------- + + + +// -- INTEGER PROPERTIES ------------------------------------------------------- + +// The bit size of the integer type used to track values such as dimensions, +// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed +// integers while 64 results in 64-bit integers. Any other value results in use +// of the C99 type "long int". Note that this ONLY affects integers used +// internally within BLIS as well as those exposed in the native BLAS-like BLIS +// interface. +#define BLIS_INT_TYPE_SIZE 32 + + + +// -- FLOATING-POINT PROPERTIES ------------------------------------------------ + +// Define the number of floating-point types supported, and the size of the +// largest type. +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + +// Enable use of built-in C99 "float complex" and "double complex" types and +// associated overloaded operations and functions? Disabling results in +// scomplex and dcomplex being defined in terms of simple structs. +//#define BLIS_ENABLE_C99_COMPLEX + + + +// -- MULTITHREADING ----------------------------------------------------------- + +// The maximum number of BLIS threads that will run concurrently. +#define BLIS_MAX_NUM_THREADS 1 + + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +// -- Contiguous (static) memory allocator -- + +// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the +// contiguous memory pools. 
+#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_MC_X_NC_BLOCKS 0 + +// The maximum preload byte offset is used to pad the end of the contiguous +// memory pools so that the micro-kernel, when computing with the end of the +// last block, can exceed the bounds of the usable portion of the memory +// region without causing a segmentation fault. +#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 + +// -- Memory alignment -- + +// It is sometimes useful to define the various memory alignments in terms +// of some other characteristics of the system, such as the cache line size +// and the page size. +#define BLIS_CACHE_LINE_SIZE 32 +#define BLIS_PAGE_SIZE 4096 + +// Alignment size needed by the instruction set for aligned SIMD/vector +// instructions. +#define BLIS_SIMD_ALIGN_SIZE 32 + +// Alignment size used to align local stack buffers within macro-kernel +// functions. +#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when allocating memory dynamically from the operating +// system (eg: posix_memalign()). To disable heap alignment and just use +// malloc() instead, set this to 1. +#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when sizing leading dimensions of dynamically +// allocated memory. +#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE + +// Alignment size used when allocating entire blocks of contiguous memory +// from the contiguous memory allocator. +#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE + +// Alignment size used when sizing strides (eg: of packed micro-panels) +// within a block of contiguous memory. +#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + + + +// -- MIXED DATATYPE SUPPORT --------------------------------------------------- + +// Basic (homogeneous) datatype support always enabled. + +// Enable mixed domain operations? 
+//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT + +// Enable extra mixed precision operations? +//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT + + + +// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- + +// Stay initialized after auto-initialization, unless and until the user +// explicitly calls bli_finalize(). +#define BLIS_ENABLE_STAY_AUTO_INITIALIZED + + + +// -- BLAS-to-BLIS COMPATIBILITY LAYER ----------------------------------------- + +// Enable the BLAS compatibility layer? +#define BLIS_ENABLE_BLAS2BLIS + +// The bit size of the integer type used to track values such as dimensions and +// leading dimensions (ie: column strides) within the BLAS compatibility layer. +// A value of 32 results in the compatibility layer using 32-bit signed integers +// while 64 results in 64-bit integers. Any other value results in use of the +// C99 type "long int". Note that this ONLY affects integers used within the +// BLAS compatibility layer. +#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 + +// Fortran-77 name-mangling macros. +#define PASTEF770(name) name ## _ +#define PASTEF77(ch1,name) ch1 ## name ## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ + + + + +#endif + diff --git a/config/armv7a/bli_kernel.h b/config/armv7a/bli_kernel.h new file mode 100644 index 000000000..0a1c356b4 --- /dev/null +++ b/config/armv7a/bli_kernel.h @@ -0,0 +1,348 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_KERNEL_H +#define BLIS_KERNEL_H + + +// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- + +// -- Default cache blocksizes -- + +// +// Constraints: +// +// (1) MC must be a multiple of: +// (a) MR (for zero-padding purposes) +// (b) NR (for zero-padding purposes when MR and NR are "swapped") +// (2) NC must be a multiple of +// (a) NR (for zero-padding purposes) +// (b) MR (for zero-padding purposes when MR and NR are "swapped") +// (3) KC must be a multiple of +// (a) MR and +// (b) NR (for triangular operations such as trmm and trsm). 
+// + +#define BLIS_DEFAULT_MC_S 432 +#define BLIS_DEFAULT_KC_S 352 +#define BLIS_DEFAULT_NC_S 4096 + +#define BLIS_DEFAULT_MC_D 192 +#define BLIS_DEFAULT_KC_D 256 +#define BLIS_DEFAULT_NC_D 4096 + +#define BLIS_DEFAULT_MC_C 64 +#define BLIS_DEFAULT_KC_C 128 +#define BLIS_DEFAULT_NC_C 4096 + +#define BLIS_DEFAULT_MC_Z 64 +#define BLIS_DEFAULT_KC_Z 128 +#define BLIS_DEFAULT_NC_Z 4096 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. + +#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) + +// -- Default register blocksizes for micro-kernel -- + +// NOTE: When using the reference configuration, these register blocksizes +// in the m and n dimensions should all be equal to the size expected by +// the reference micro-kernel(s). 
+
+#define BLIS_DEFAULT_MR_S 4
+#define BLIS_DEFAULT_NR_S 4
+
+#define BLIS_DEFAULT_MR_D 4
+#define BLIS_DEFAULT_NR_D 4
+
+#define BLIS_DEFAULT_MR_C 2
+#define BLIS_DEFAULT_NR_C 2
+
+#define BLIS_DEFAULT_MR_Z 2
+#define BLIS_DEFAULT_NR_Z 2
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (ie: when k % f > 0) then these
+// register blocksizes in the k dimension can be defined to 1.
+
+#define BLIS_DEFAULT_KR_S 1
+#define BLIS_DEFAULT_KR_D 1
+#define BLIS_DEFAULT_KR_C 1
+#define BLIS_DEFAULT_KR_Z 1
+
+// -- Register blocksize extensions (for packed micro-panels) --
+
+// NOTE: These register blocksize "extensions" determine whether the
+// leading dimensions used within the packed micro-panels are equal to
+// or greater than their corresponding register blocksizes above.
+
+#define BLIS_EXTEND_MR_S 0
+#define BLIS_EXTEND_NR_S 0
+
+#define BLIS_EXTEND_MR_D 0
+#define BLIS_EXTEND_NR_D 0
+
+#define BLIS_EXTEND_MR_C 0
+#define BLIS_EXTEND_NR_C 0
+
+#define BLIS_EXTEND_MR_Z 0
+#define BLIS_EXTEND_NR_Z 0
+
+// Register blocksize extensions in the k dimension are not used.
+
+#define BLIS_EXTEND_KR_S 0
+#define BLIS_EXTEND_KR_D 0
+#define BLIS_EXTEND_KR_C 0
+#define BLIS_EXTEND_KR_Z 0
+
+// -- Default incremental packing blocksizes (n dimension) --
+
+// NOTE: These incremental packing blocksizes (for the n dimension) are only
+// used by certain blocked variants. But when they *are* used, they MUST
+// be an integer multiple of NR!
+ +#define BLIS_DEFAULT_NI_FAC 16 +#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S) +#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D) +#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C) +#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z) + + + +// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- + +// NOTE: These values determine high-level cache blocking for level-2 +// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and +// MC = NC = 1000, then a total of four unblocked (or unblocked fused) +// gemv subproblems are called. The blocked algorithms are only useful in +// that they provide the opportunity for packing vectors. (Matrices can also +// be packed here, but this tends to be much too expensive in practice to +// actually employ.) + +#define BLIS_DEFAULT_L2_MC_S 1000 +#define BLIS_DEFAULT_L2_NC_S 1000 + +#define BLIS_DEFAULT_L2_MC_D 1000 +#define BLIS_DEFAULT_L2_NC_D 1000 + +#define BLIS_DEFAULT_L2_MC_C 1000 +#define BLIS_DEFAULT_L2_NC_C 1000 + +#define BLIS_DEFAULT_L2_MC_Z 1000 +#define BLIS_DEFAULT_L2_NC_Z 1000 + + + +// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ + +// -- Default fusing factors for level-1f operations -- + +// NOTE: Default fusing factors are not used by the reference implementations +// of level-1f operations. They are here only for use when these operations +// are optimized. 
+ +#define BLIS_DEFAULT_FUSE_FAC_S 8 +#define BLIS_DEFAULT_FUSE_FAC_D 4 +#define BLIS_DEFAULT_FUSE_FAC_C 4 +#define BLIS_DEFAULT_FUSE_FAC_Z 2 + +#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + +#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + +#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + + + +// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ + +// -- Default register blocksizes for vectors -- + +// NOTE: Register blocksizes for vectors are used when packing +// non-contiguous vectors. Similar to that of KR, they can +// typically be set to 1. 
+ +#define BLIS_DEFAULT_VR_S 1 +#define BLIS_DEFAULT_VR_D 1 +#define BLIS_DEFAULT_VR_C 1 +#define BLIS_DEFAULT_VR_Z 1 + + + +// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- + +// -- gemm -- + +#include "bli_gemm_opt_4x4.h" +#define GEMM_UKERNEL gemm_opt_4x4 + +// -- trsm-related -- + +#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn +#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn + +#define TRSM_L_UKERNEL trsm_l_ref_mxn +#define TRSM_U_UKERNEL trsm_u_ref_mxn + + + +// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- + +// -- packm -- + +#define PACKM_2XK_KERNEL packm_ref_2xk +#define PACKM_4XK_KERNEL packm_ref_4xk +#define PACKM_6XK_KERNEL packm_ref_6xk +#define PACKM_8XK_KERNEL packm_ref_8xk +#define PACKM_10XK_KERNEL packm_ref_10xk +#define PACKM_12XK_KERNEL packm_ref_12xk +#define PACKM_14XK_KERNEL packm_ref_14xk +#define PACKM_16XK_KERNEL packm_ref_16xk + +// -- unpackm -- + +#define UNPACKM_2XK_KERNEL unpackm_ref_2xk +#define UNPACKM_4XK_KERNEL unpackm_ref_4xk +#define UNPACKM_6XK_KERNEL unpackm_ref_6xk +#define UNPACKM_8XK_KERNEL unpackm_ref_8xk +#define UNPACKM_10XK_KERNEL unpackm_ref_10xk +#define UNPACKM_12XK_KERNEL unpackm_ref_12xk +#define UNPACKM_14XK_KERNEL unpackm_ref_14xk +#define UNPACKM_16XK_KERNEL unpackm_ref_16xk + + + +// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- + +// -- axpy2v -- + +#define AXPY2V_KERNEL axpy2v_unb_var1 + +// -- dotaxpyv -- + +#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 + +// -- axpyf -- + +#define AXPYF_KERNEL axpyf_unb_var1 + +// -- dotxf -- + +#define DOTXF_KERNEL dotxf_unb_var1 + +// -- dotxaxpyf -- + +#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 + + + +// -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- + +// -- addv -- + +#define ADDV_KERNEL addv_unb_var1 + +// -- axpyv -- + +#define AXPYV_KERNEL axpyv_unb_var1 + +// -- copyv -- + +#define COPYV_KERNEL copyv_unb_var1 + +// -- dotv -- + +#define 
DOTV_KERNEL dotv_unb_var1 + +// -- dotxv -- + +#define DOTXV_KERNEL dotxv_unb_var1 + +// -- invertv -- + +#define INVERTV_KERNEL invertv_unb_var1 + +// -- scal2v -- + +#define SCAL2V_KERNEL scal2v_unb_var1 + +// -- scalv -- + +#define SCALV_KERNEL scalv_unb_var1 + +// -- setv -- + +#define SETV_KERNEL setv_unb_var1 + +// -- subv -- + +#define SUBV_KERNEL subv_unb_var1 + +// -- swapv -- + +#define SWAPV_KERNEL swapv_unb_var1 + + + +#endif + diff --git a/config/armv7a/kernels/3/bli_cgemm_kernel_2x2.S b/config/armv7a/kernels/3/bli_cgemm_kernel_2x2.S new file mode 100644 index 000000000..fd2be6fab --- /dev/null +++ b/config/armv7a/kernels/3/bli_cgemm_kernel_2x2.S @@ -0,0 +1,502 @@ + +#define REALNAME bli_cgemm_kernel_2x2 + +#define STACKSIZE 256 + +#define K r0 +#define PTR_ALPHA r1 +#define OLD_A r2 +#define OLD_B r3 +#define PTR_BETA [fp, #0 ] +#define OLD_C [fp, #4 ] +#define OLD_RSC [fp, #8 ] +#define OLD_CSC [fp, #12 ] +#define AUX [fp, #16 ] + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* register +*******************************************************/ + +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r7 +#define CO2 r8 + + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#define FMAC_BR fnmacs +#define FMAC_BI fmacs + +#define NN 1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || 
defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fnmacs + +#endif + + + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmuls s24 , s1, s9 + flds s3 , [ AO, #12 ] + fmuls s17 , s0, s9 + flds s10, [ BO, #8 ] + fmuls s25 , s1, s8 + + flds s11, [ BO, #12 ] + fmuls s18 , s2, s8 + add BO , BO, #16 + fmuls s26 , s3, s9 + add AO , AO, #16 + fmuls s19 , s2, s9 + pld [ BO , #B_PRE ] + fmuls s27 , s3, s8 + + pld [ AO , #A_PRE ] + fmuls s20 , s0, s10 + flds s4 , [ AO, #0 ] + fmuls s28 , s1, s11 + flds s5 , [ AO, #4 ] + fmuls s21 , s0, s11 + flds s12, [ BO ] + fmuls s29 , s1, s10 + + flds s13, [ BO, #4 ] + fmuls s22 , s2, s10 + flds s6 , [ AO, #8 ] + fmuls s30 , s3, s11 + flds s7 , [ AO, #12 ] + fmuls s23 , s2, s11 + flds s14, [ BO, #8 ] + fmuls s31 , s3, s10 + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacs s16 , s0, s8 + pld [ BO , #B_PRE ] + fmacs s24 , s1, s9 + flds s4 , [ AO, #0 ] + fmacs s17 , s0, s9 + flds s5 , [ AO, #4 ] + fmacs s25 , s1, s8 + + flds s12, [ BO ] + fmacs s18 , s2, s8 + flds s13, [ BO, #4 ] + fmacs s26 , s3, s9 + flds s6 , [ AO, #8 ] + fmacs s19 , s2, s9 + flds s7 , [ AO, #12 ] + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + 
flds s14, [ BO, #8 ] + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + flds s15, [ BO, #12 ] + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + add BO , BO, #16 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + add AO , AO, #16 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + flds s0 , [ AO, #0 ] + fmacs s17 , s4, s13 + flds s1 , [ AO, #4 ] + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + flds s8 , [ BO ] + fmacs s26 , s7, s13 + flds s9 , [ BO, #4 ] + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s2 , [ AO, #8 ] + fmacs s20 , s4, s14 + flds s3 , [ AO, #12 ] + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + flds s10, [ BO, #8 ] + fmacs s29 , s5, s14 + + flds s11, [ BO, #12 ] + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + add BO , BO, #16 + fmacs s23 , s6, s15 + add AO , AO, #16 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s24 , s1, s9 + flds s3 , [ AO, #12 ] + fmacs s17 , s0, s9 + flds s10, [ BO, #8 ] + fmacs s25 , s1, s8 + + flds s11, [ BO, #12 ] + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + add BO , BO, #16 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + add AO , AO, #16 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3, OLD_RSC // Row stride size + lsl r3, r3, #3 // multiply with size of complex float + + flds s0, [ PTR_ALPHA 
] // load real part of alpha + flds s1, [ PTR_ALPHA, #4 ] // load imag part of alpha + ldr r4, PTR_BETA + flds s2, [ r4 ] // load real part of beta + flds s3, [ r4, #4 ] // load imag part of beta + + // Add/Sub the real and the imag parts + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + mov r4, CO1 // save pointer + fldmias CO1, { s4 - s5 } // read real and imag part from C + add CO1, CO1, r3 + + mov r2, CO2 // save pointer + fldmias CO2, { s8 - s9 } // read real and imag part from C + add CO2, CO2, r3 + + fmuls s24, s4, s2 // multiply Beta-real with C-real + fmuls s25, s5, s2 // multiply Beta-real with C-imag + fmuls s28, s8, s2 // multiply Beta-real with C-real + fmuls s29, s9, s2 // multiply Beta-real with C-imag + + FMAC_BR s24, s3, s5 // multiply beta-imag with C-imag and add + FMAC_BI s25, s3, s4 // multiply beta-imag with C-real and add + FMAC_BR s28, s3, s9 // multiply beta-imag with C-imag and add + FMAC_BI s29, s3, s8 // multiply beta-imag with C-real and add + + FMAC_R1 s24 , s0 , s16 + FMAC_I1 s25 , s0 , s17 + FMAC_R2 s24 , s1 , s17 + FMAC_I2 s25 , s1 , s16 + + FMAC_R1 s28 , s0 , s20 + FMAC_I1 s29 , s0 , s21 + FMAC_R2 s28 , s1 , s21 + FMAC_I2 s29 , s1 , s20 + + fldmias CO1, { s4 - s5 } // read real and imag part from C + fldmias CO2, { s8 - s9 } // read real and imag part from C + + fmuls s26, s4, s2 // multiply Beta-real with C-real + fmuls s27, s5, s2 // multiply Beta-real with C-imag + fmuls s30, s8, s2 // multiply Beta-real with C-real + fmuls s31, s9, s2 // multiply Beta-real with C-imag + + FMAC_BR s26, s3, s5 // multiply beta-imag with C-imag and add + FMAC_BI s27, s3, s4 // multiply beta-imag with C-real and add + FMAC_BR s30, s3, s9 // multiply beta-imag with C-imag and add + FMAC_BI s31, s3, s8 // multiply beta-imag with C-real and add + + FMAC_R1 s26 , s0 , s18 + FMAC_I1 s27 , s0 , s19 + FMAC_R2 
s26 , s1 , s19 + FMAC_I2 s27 , s1 , s18 + + FMAC_R1 s30, s0 , s22 + FMAC_I1 s31, s0 , s23 + FMAC_R2 s30, s1 , s23 + FMAC_I2 s31, s1 , s22 + + mov CO1, r4 // restore pointer + mov CO2, r2 // restore pointer + fstmias CO1, { s24 - s25 } + fstmias CO2, { s28 - s29 } + add CO1, CO1, r3 + add CO2, CO2, r3 + fstmias CO1, { s26 - s27 } + fstmias CO2, { s30 - s31 } + + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + .arm + .global REALNAME + .func REALNAME + +REALNAME: + + push {r4 - r9, fp} // save register + add fp, sp, #28 // add number of saved register multiplied by size of int + sub sp, sp, #STACKSIZE // reserve stack + + mov AO, OLD_A // pointer matrix A + mov BO, OLD_B // pointer matrix B + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r2, OLD_C // pointer matrix C + ldr r3, OLD_CSC // Col stride size of C + lsl r3, r3, #3 // multiply with size of complex float + + mov CO1, r2 // first line of C + add CO2, CO1, r3 // second line of C + + pld [ CO1, #C_PRE ] // prefetch the lines of C + pld [ CO2, #C_PRE ] // prefetch the lines of C + +cgemm_kernel_L2_M2_20: + + asrs L , K, #3 // L = K / 8 + cmp L , #2 + blt cgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #2 + ble cgemm_kernel_L2_M2_22a + .align 5 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt cgemm_kernel_L2_M2_22 + +cgemm_kernel_L2_M2_22a: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_32: + + tst L, #1 + ble cgemm_kernel_L2_M2_40 + + KERNEL2x2_I 
+ KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_40: + + INIT2x2 + +cgemm_kernel_L2_M2_44: + + ands L , K, #7 // L = K % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne cgemm_kernel_L2_M2_46 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + sub sp, fp, #28 + pop {r4 - r9, fp} + bx lr + diff --git a/config/armv7a/kernels/3/bli_dgemm_kernel_4x4.S b/config/armv7a/kernels/3/bli_dgemm_kernel_4x4.S new file mode 100644 index 000000000..fc0282846 --- /dev/null +++ b/config/armv7a/kernels/3/bli_dgemm_kernel_4x4.S @@ -0,0 +1,503 @@ + +#define REALNAME bli_dgemm_kernel_4x4 + +#define STACKSIZE 256 + +#define K r0 +#define PTR_ALPHA r1 +#define OLD_A r2 +#define OLD_B r3 +#define PTR_BETA [fp, #0 ] +#define OLD_C [fp, #4 ] +#define OLD_RSC [fp, #8 ] +#define OLD_CSC [fp, #12 ] +#define AUX [fp, #16 ] + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* register +*******************************************************/ + +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r7 +#define CO2 r8 +#define CO3 r9 +#define CO4 r12 + + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + 
+.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + fldd d8 , [ BO ] + fldd d0 , [ AO ] + pld [ AO , #A_PRE ] + + fldd d1 , [ AO, #8 ] + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d17 , d1, d8 + fldd d3 , [ AO, #24 ] + fmuld d18 , d2, d8 + fldd d9 , [ BO, #8 ] + fmuld d19 , d3, d8 + + fldd d10, [ BO, #16 ] + fmuld d20 , d0, d9 + fldd d11, [ BO, #24 ] + fmuld d21 , d1, d9 + add BO , BO, #32 + add AO , AO, #32 + fmuld d22 , d2, d9 + + pld [ BO , #B_PRE ] + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + pld [ AO , #A_PRE ] + fldd d4 , [ AO, #0 ] + fmuld d24 , d0, d10 + fldd d5 , [ AO, #8 ] + fmuld d25 , d1, d10 + fldd d6 , [ AO, #16 ] + fmuld d26 , d2, d10 + fldd d7 , [ AO, #24 ] + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + +.endm + +.macro KERNEL4x4_M2 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE+32 ] + fmacd d17 , d5, d12 + fldd d0 , [ AO , #32 ] + fmacd d18 , d6, d12 + pld [ BO , #B_PRE+32 ] + fmacd d19 , d7, d12 + + fldd d8 , [ BO , #32 ] + fmacd d20 , d4, d13 + fldd d1 , [ AO, #40 ] + fmacd d21 , d5, d13 + fldd d2 , [ AO, #48 ] + fmacd d22 , d6, d13 + fldd d3 , [ AO, #56 ] + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fldd d9 , [ BO, #40 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #48 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #56 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + add AO , AO, #64 + fmacd d30 , d6, d15 + add BO , BO, #64 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_M1 + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, d8 + fldd d4 , [ AO ] + fmacd d18 , d2, d8 + pld [ BO , #B_PRE ] + fmacd d19 , d3, d8 + + fldd d12, [ BO ] + fmacd d20 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d21 , d1, d9 + fldd d6 , [ AO, #16 ] + fmacd d22 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fldd d13, [ BO, #8 ] + fmacd d26 , d2, d10 + fldd d14, [ 
BO, #16 ] + fmacd d27 , d3, d10 + + fldd d15, [ BO, #24 ] + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + +.endm + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + fmacd d17 , d5, d12 + add BO , BO, #32 + fmacd d18 , d6, d12 + add AO , AO, #32 + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_SUB + + fldd d8 , [ BO ] + pld [ BO , #B_PRE ] + + fldd d0 , [ AO ] + pld [ AO , #A_PRE ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d17 , d1, d8 + fldd d3 , [ AO, #24 ] + fmacd d18 , d2, d8 + fldd d9 , [ BO, #8 ] + fmacd d19 , d3, d8 + + fldd d10, [ BO, #16 ] + fmacd d20 , d0, d9 + fldd d11, [ BO, #24 ] + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + + ldr r3, OLD_RSC // Row stride size + lsl r3, r3, #3 // multiply with size of double + + fldd d0, [ PTR_ALPHA ] // load alpha + ldr r4, PTR_BETA + fldd d1, [ r4 ] // load beta + +//----------------------------------------------------------- + mov r2, CO1 // save pointer + mov r4, CO2 // save pointer + fldd d8, [ CO1 ] // load value from C + fldd d12, [ CO2 ] // load value from C + fmuld d8, d8, d1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacd d8, d0, d16 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + fldd d9, [ CO1 ] // load value from C + fldd d13, [ CO2 ] // load value from C + fmuld d9, d9, d1 // multiply with beta + add CO1, CO1, r3 // compute next 
pointer + fmacd d9, d0, d17 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + fldd d10, [ CO1 ] // load value from C + fldd d14, [ CO2 ] // load value from C + fmuld d10, d10, d1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacd d10, d0, d18 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + fldd d11, [ CO1 ] // load value from C + fldd d15, [ CO2 ] // load value from C + fmuld d11, d11, d1 // multiply with beta + mov CO1, r2 // restore pointer + fmacd d11, d0, d19 // multiply sum with alpha and add to value of C + mov CO2, r4 // restore pointer + + fstd d8, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fstd d9, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fstd d10, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fstd d11, [ CO1 ] // store value in C + +//----------------------------------------------------------- + mov r2, CO3 // save pointer + fldd d8, [ CO3 ] // load value from C + fmuld d12, d12, d1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacd d12, d0, d20 // multiply sum with alpha and add to value of C + + fldd d9, [ CO3 ] // load value from C + fmuld d13, d13, d1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacd d13, d0, d21 // multiply sum with alpha and add to value of C + + fldd d10, [ CO3 ] // load value from C + fmuld d14, d14, d1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacd d14, d0, d22 // multiply sum with alpha and add to value of C + + fldd d11, [ CO3 ] // load value from C + fmuld d15, d15, d1 // multiply with beta + mov CO3, r2 // restore pointer + fmacd d15, d0, d23 // multiply sum with alpha and add to value of C + + fstd d12, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fstd d13, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next 
pointer + fstd d14, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fstd d15, [ CO2 ] // store value in C + +//----------------------------------------------------------- + mov r4, CO4 // save pointer + fldd d12, [ CO4 ] // load value from C + fmuld d8, d8, d1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacd d8, d0, d24 // multiply sum with alpha and add to value of C + + fldd d13, [ CO4 ] // load value from C + fmuld d9, d9, d1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacd d9, d0, d25 // multiply sum with alpha and add to value of C + + fldd d14, [ CO4 ] // load value from C + fmuld d10, d10, d1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacd d10, d0, d26 // multiply sum with alpha and add to value of C + + fldd d15, [ CO4 ] // load value from C + fmuld d11, d11, d1 // multiply with beta + mov CO4, r4 // restore pointer + fmacd d11, d0, d27 // multiply sum with alpha and add to value of C + + +//----------------------------------------------------------- + fstd d8, [ CO3 ] // store value in C + fmuld d12, d12, d1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacd d12, d0, d28 // multiply sum with alpha and add to value of C + + fstd d9, [ CO3 ] // store value in C + fmuld d13, d13, d1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacd d13, d0, d29 // multiply sum with alpha and add to value of C + + fstd d10, [ CO3 ] // store value in C + fmuld d14, d14, d1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacd d14, d0, d30 // multiply sum with alpha and add to value of C + + fstd d11, [ CO3 ] // store value in C + fmuld d15, d15, d1 // multiply with beta + fstd d12, [ CO4 ] // store value in C + fmacd d15, d0, d31 // multiply sum with alpha and add to value of C + + add CO4 , CO4, r3 // compute next pointer + fstd d13, [ CO4 ] // store value in C + add CO4 , CO4, r3 // compute next pointer + fstd 
d14, [ CO4 ]                                       // store value in C
+	add	CO4 , CO4, r3                              // compute next pointer
+	fstd	d15, [ CO4 ]                               // store value in C
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	.arm
+	.global	REALNAME
+	.func	REALNAME
+
+REALNAME:
+
+	push	{r4 - r9, fp}                              // save register
+	add	fp, sp, #28                                // add number of saved register multiplied by size of int
+	sub	sp, sp, #STACKSIZE                         // reserve stack
+
+	mov	AO, OLD_A                                  // pointer matrix A
+	mov	BO, OLD_B                                  // pointer matrix B
+
+	sub	r3, fp, #128
+	vstm	r3, { d8 - d15}                            // store floating point registers
+
+	ldr	r2, OLD_C                                  // pointer matrix C
+	ldr	r3, OLD_CSC                                // Col stride size of C
+	lsl	r3, r3, #3                                 // multiply with size of double
+
+	mov	CO1, r2                                    // first line of C
+	add	CO2, CO1, r3                               // second line of C
+	add	CO3, CO2, r3                               // third line of C
+	add	CO4, CO3, r3                               // fourth line of C
+
+	pld	[ CO1, #C_PRE ]                            // prefetch the lines of C
+	pld	[ CO2, #C_PRE ]                            // prefetch the lines of C
+	pld	[ CO3, #C_PRE ]                            // prefetch the lines of C
+	pld	[ CO4, #C_PRE ]                            // prefetch the lines of C (fixed: was CO3 twice, CO4 never prefetched)
+
+dgemm_kernel_L4_M4_20:
+
+	asrs	L , K, #3                                  // L = K / 8
+	cmp	L , #2
+	blt	dgemm_kernel_L4_M4_32
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	L, L, #2
+	ble	dgemm_kernel_L4_M4_22a
+	.align 5
+
+dgemm_kernel_L4_M4_22:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L4_M4_22
+
+dgemm_kernel_L4_M4_22a:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	dgemm_kernel_L4_M4_44
+
+dgemm_kernel_L4_M4_32:
+
+	tst	L, #1
+	ble	dgemm_kernel_L4_M4_40
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + +dgemm_kernel_L4_M4_40: + + INIT4x4 + +dgemm_kernel_L4_M4_44: + + ands L , K, #7 // L = K % 8 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne dgemm_kernel_L4_M4_46 + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + sub sp, fp, #28 + pop {r4 - r9, fp} + bx lr + diff --git a/config/armv7a/kernels/3/bli_gemm_opt_4x4.c b/config/armv7a/kernels/3/bli_gemm_opt_4x4.c new file mode 100644 index 000000000..e1ea2d309 --- /dev/null +++ b/config/armv7a/kernels/3/bli_gemm_opt_4x4.c @@ -0,0 +1,134 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+extern void bli_sgemm_kernel_4x4(dim_t k,
+                float* alpha,
+                float* restrict a,
+                float* restrict b,
+                float* beta,
+                float* restrict c, inc_t rs_c, inc_t cs_c,
+                auxinfo_t* data
+                );
+
+
+void bli_sgemm_opt_4x4(
+                dim_t k,
+                float* restrict alpha,
+                float* restrict a,
+                float* restrict b,
+                float* restrict beta,
+                float* restrict c, inc_t rs_c, inc_t cs_c,
+                auxinfo_t* data
+                )
+{
+
+	bli_sgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data); /* forward to the hand-written VFP assembly kernel */
+
+}
+
+extern void bli_dgemm_kernel_4x4(dim_t k,
+                double* alpha,
+                double* restrict a,
+                double* restrict b,
+                double* beta,
+                double* restrict c, inc_t rs_c, inc_t cs_c,
+                auxinfo_t* data
+                );
+
+
+void bli_dgemm_opt_4x4(
+                dim_t k,
+                double* restrict alpha,
+                double* restrict a,
+                double* restrict b,
+                double* restrict beta,
+                double* restrict c, inc_t rs_c, inc_t cs_c,
+                auxinfo_t* data
+                )
+{
+	bli_dgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data); /* forward to the hand-written VFP assembly kernel */
+}
+
+extern void bli_cgemm_kernel_2x2(dim_t k,
+                scomplex* alpha,
+                scomplex* restrict a,
+                scomplex* restrict b,
+                scomplex* beta,
+                scomplex* restrict c, inc_t rs_c, inc_t cs_c,
+                auxinfo_t* data
+                );
+
+
+
+
+void bli_cgemm_opt_4x4(
+                dim_t k,
+                scomplex* restrict alpha,
+                scomplex* restrict a,
+                scomplex* restrict b,
+                scomplex* restrict beta,
+                scomplex* restrict c, inc_t rs_c, inc_t cs_c,
+                auxinfo_t* data
+                )
+{
+
+	bli_cgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data); /* NOTE(review): 4x4-named wrapper dispatches to a 2x2 kernel — confirm intended */
+}
+
+extern void
bli_zgemm_kernel_2x2(dim_t k, + dcomplex* alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ); + + +void bli_zgemm_opt_4x4( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) +{ + + bli_zgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data); +} + diff --git a/config/armv7a/kernels/3/bli_gemm_opt_4x4.h b/config/armv7a/kernels/3/bli_gemm_opt_4x4.h new file mode 100644 index 000000000..aa792c061 --- /dev/null +++ b/config/armv7a/kernels/3/bli_gemm_opt_4x4.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+// #include "arm_neon.h"
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                dim_t k, \
+                ctype* restrict alpha, \
+                ctype* restrict a, \
+                ctype* restrict b, \
+                ctype* restrict beta, \
+                ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+                auxinfo_t* data \
+                );
+
+INSERT_GENTPROT_BASIC( gemm_opt_4x4 ) /* presumably expands GENTPROT for the four basic types (s,d,c,z) — per BLIS GENT framework */
+
diff --git a/config/armv7a/kernels/3/bli_sgemm_kernel_4x4.S b/config/armv7a/kernels/3/bli_sgemm_kernel_4x4.S
new file mode 100644
index 000000000..0cbc30b83
--- /dev/null
+++ b/config/armv7a/kernels/3/bli_sgemm_kernel_4x4.S
@@ -0,0 +1,483 @@
+
+#define REALNAME bli_sgemm_kernel_4x4
+
+#define STACKSIZE 256
+
+#define	K		r0
+#define	PTR_ALPHA	r1
+#define	OLD_A		r2
+#define	OLD_B		r3
+#define	PTR_BETA	[fp, #0 ]
+#define	OLD_C		[fp, #4 ]
+#define	OLD_RSC		[fp, #8 ]
+#define	OLD_CSC		[fp, #12 ]
+#define	AUX		[fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* register
+*******************************************************/
+
+#define	L		r2
+
+#define	AO		r5
+#define	BO		r6
+
+#define	CO1		r7
+#define	CO2		r8
+#define	CO3		r9
+#define	CO4		r12
+
+
+#define A_PRE	96
+#define B_PRE	96
+#define C_PRE	0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + pld [ AO , #A_PRE ] + fldmias AO!, { s0 - s1 } + pld [ BO , #B_PRE ] + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s3 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias BO!, { s8 - s11 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + //fldmias AO!, { s2 - s3 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + //fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s7 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias BO!, { s12 - s15 } + //fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + //fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , 
s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + +.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + + +.macro SAVE4x4 + + ldr r3, OLD_RSC // Row stride size + lsl r3, r3, #2 // multiply with size of float + + flds s0, [ PTR_ALPHA ] // load alpha + ldr r4, PTR_BETA + flds s1, [ r4 ] // load beta + +//----------------------------------------------------------- + mov r2, CO1 // save pointer + mov r4, CO2 // save pointer + flds s8, [ CO1 ] // load value from C + flds s12, [ CO2 ] // load value from C + fmuls s8, s8, s1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacs s8, s0, s16 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + flds s9, [ CO1 ] // load value from C + flds s13, [ CO2 ] // load value from C + fmuls s9, s9, s1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacs s9, s0, s17 // multiply sum with alpha and add to value of C + add 
CO2, CO2, r3 // compute next pointer + + flds s10, [ CO1 ] // load value from C + flds s14, [ CO2 ] // load value from C + fmuls s10, s10, s1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacs s10, s0, s18 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + flds s11, [ CO1 ] // load value from C + flds s15, [ CO2 ] // load value from C + fmuls s11, s11, s1 // multiply with beta + mov CO1, r2 // restore pointer + fmacs s11, s0, s19 // multiply sum with alpha and add to value of C + mov CO2, r4 // restore pointer + + fsts s8, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fsts s9, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fsts s10, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fsts s11, [ CO1 ] // store value in C + +//----------------------------------------------------------- + mov r2, CO3 // save pointer + flds s8, [ CO3 ] // load value from C + fmuls s12, s12, s1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacs s12, s0, s20 // multiply sum with alpha and add to value of C + + flds s9, [ CO3 ] // load value from C + fmuls s13, s13, s1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacs s13, s0, s21 // multiply sum with alpha and add to value of C + + flds s10, [ CO3 ] // load value from C + fmuls s14, s14, s1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacs s14, s0, s22 // multiply sum with alpha and add to value of C + + flds s11, [ CO3 ] // load value from C + fmuls s15, s15, s1 // multiply with beta + mov CO3, r2 // restore pointer + fmacs s15, s0, s23 // multiply sum with alpha and add to value of C + + fsts s12, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fsts s13, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fsts s14, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next 
pointer + fsts s15, [ CO2 ] // store value in C + +//----------------------------------------------------------- + mov r4, CO4 // save pointer + flds s12, [ CO4 ] // load value from C + fmuls s8, s8, s1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacs s8, s0, s24 // multiply sum with alpha and add to value of C + + flds s13, [ CO4 ] // load value from C + fmuls s9, s9, s1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacs s9, s0, s25 // multiply sum with alpha and add to value of C + + flds s14, [ CO4 ] // load value from C + fmuls s10, s10, s1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacs s10, s0, s26 // multiply sum with alpha and add to value of C + + flds s15, [ CO4 ] // load value from C + fmuls s11, s11, s1 // multiply with beta + mov CO4, r4 // restore pointer + fmacs s11, s0, s27 // multiply sum with alpha and add to value of C + + +//----------------------------------------------------------- + fsts s8, [ CO3 ] // store value in C + fmuls s12, s12, s1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacs s12, s0, s28 // multiply sum with alpha and add to value of C + + fsts s9, [ CO3 ] // store value in C + fmuls s13, s13, s1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacs s13, s0, s29 // multiply sum with alpha and add to value of C + + fsts s10, [ CO3 ] // store value in C + fmuls s14, s14, s1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacs s14, s0, s30 // multiply sum with alpha and add to value of C + + fsts s11, [ CO3 ] // store value in C + fmuls s15, s15, s1 // multiply with beta + fsts s12, [ CO4 ] // store value in C + fmacs s15, s0, s31 // multiply sum with alpha and add to value of C + + add CO4 , CO4, r3 // compute next pointer + fsts s13, [ CO4 ] // store value in C + add CO4 , CO4, r3 // compute next pointer + fsts s14, [ CO4 ] // store value in C + add CO4 , CO4, r3 // compute next pointer + fsts 
s15, [ CO4 ]                                       // store value in C
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	.arm
+	.global	REALNAME
+	.func	REALNAME
+
+REALNAME:
+
+	push	{r4 - r9, fp}                              // save register
+	add	fp, sp, #28                                // add number of saved register multiplied by size of int
+	sub	sp, sp, #STACKSIZE                         // reserve stack
+
+	mov	AO, OLD_A                                  // pointer matrix A
+	mov	BO, OLD_B                                  // pointer matrix B
+
+	sub	r3, fp, #128
+	vstm	r3, { s8 - s31 }                           // store floating point registers
+
+	ldr	r2, OLD_C                                  // pointer matrix C
+	ldr	r3, OLD_CSC                                // Col stride size of C
+	lsl	r3, r3, #2                                 // multiply with size of float
+
+	mov	CO1, r2                                    // first line of C
+	add	CO2, CO1, r3                               // second line of C
+	add	CO3, CO2, r3                               // third line of C
+	add	CO4, CO3, r3                               // fourth line of C
+
+	pld	[ CO1, #C_PRE ]                            // prefetch the lines of C
+	pld	[ CO2, #C_PRE ]                            // prefetch the lines of C
+	pld	[ CO3, #C_PRE ]                            // prefetch the lines of C
+	pld	[ CO4, #C_PRE ]                            // prefetch the lines of C (fixed: was CO3 twice, CO4 never prefetched)
+
+sgemm_kernel_L4_M4_20:
+
+	asrs	L , K, #3                                  // L = K / 8
+	cmp	L , #2
+	blt	sgemm_kernel_L4_M4_32
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	L, L, #2
+	ble	sgemm_kernel_L4_M4_22a
+	.align 5
+
+sgemm_kernel_L4_M4_22:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	L, L, #1
+	bgt	sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:
+
+	tst	L, #1
+	ble	sgemm_kernel_L4_M4_40
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_40:
+
+
INIT4x4 + +sgemm_kernel_L4_M4_44: + + ands L , K, #7 // L = K % 8 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne sgemm_kernel_L4_M4_46 + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31 } // restore floating point registers + + sub sp, fp, #28 + pop {r4 - r9, fp} + bx lr + diff --git a/config/armv7a/kernels/3/bli_zgemm_kernel_2x2.S b/config/armv7a/kernels/3/bli_zgemm_kernel_2x2.S new file mode 100644 index 000000000..042827d0e --- /dev/null +++ b/config/armv7a/kernels/3/bli_zgemm_kernel_2x2.S @@ -0,0 +1,506 @@ + +#define REALNAME bli_zgemm_kernel_2x2 + +#define STACKSIZE 256 + +#define K r0 +#define PTR_ALPHA r1 +#define OLD_A r2 +#define OLD_B r3 +#define PTR_BETA [fp, #0 ] +#define OLD_C [fp, #4 ] +#define OLD_RSC [fp, #8 ] +#define OLD_CSC [fp, #12 ] +#define AUX [fp, #16 ] + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* register +*******************************************************/ + +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r7 +#define CO2 r8 + + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#define FMAC_BR fnmacd +#define FMAC_BI fmacd + +#define NN 1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define 
FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fnmacd + +#endif + + + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + 
fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3, OLD_RSC // Row stride size + lsl r3, r3, #4 // multiply with size of complex 
double + + fldd d0, [ PTR_ALPHA ] // load real part of alpha + fldd d1, [ PTR_ALPHA, #8 ] // load imag part of alpha + ldr r4, PTR_BETA + fldd d2, [ r4 ] // load real part of beta + fldd d3, [ r4, #8 ] // load imag part of beta + + // Add/Sub the real and the imag parts + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + mov r4, CO1 // save pointer + fldmiad CO1, { d4 - d5 } // read real and imag part from C + add CO1, CO1, r3 + + mov r2, CO2 // save pointer + fldmiad CO2, { d8 - d9 } // read real and imag part from C + add CO2, CO2, r3 + + fmuld d24, d4, d2 // multiply Beta-real with C-real + fmuld d25, d5, d2 // multiply Beta-real with C-imag + fmuld d28, d8, d2 // multiply Beta-real with C-real + fmuld d29, d9, d2 // multiply Beta-real with C-imag + + FMAC_BR d24, d3, d5 // multiply beta-imag with C-imag and add + FMAC_BI d25, d3, d4 // multiply beta-imag with C-real and add + FMAC_BR d28, d3, d9 // multiply beta-imag with C-imag and add + FMAC_BI d29, d3, d8 // multiply beta-imag with C-real and add + + FMAC_R1 d24 , d0 , d16 + FMAC_I1 d25 , d0 , d17 + FMAC_R2 d24 , d1 , d17 + FMAC_I2 d25 , d1 , d16 + + FMAC_R1 d28 , d0 , d20 + FMAC_I1 d29 , d0 , d21 + FMAC_R2 d28 , d1 , d21 + FMAC_I2 d29 , d1 , d20 + + fldmiad CO1, { d4 - d5 } // read real and imag part from C + fldmiad CO2, { d8 - d9 } // read real and imag part from C + + fmuld d26, d4, d2 // multiply Beta-real with C-real + fmuld d27, d5, d2 // multiply Beta-real with C-imag + fmuld d30, d8, d2 // multiply Beta-real with C-real + fmuld d31, d9, d2 // multiply Beta-real with C-imag + + FMAC_BR d26, d3, d5 // multiply beta-imag with C-imag and add + FMAC_BI d27, d3, d4 // multiply beta-imag with C-real and add + FMAC_BR d30, d3, d9 // multiply beta-imag with C-imag and add + FMAC_BI d31, d3, d8 // multiply beta-imag with C-real and add + + FMAC_R1 d26 , d0 , d18 + 
FMAC_I1 d27 , d0 , d19 + FMAC_R2 d26 , d1 , d19 + FMAC_I2 d27 , d1 , d18 + + FMAC_R1 d30, d0 , d22 + FMAC_I1 d31, d0 , d23 + FMAC_R2 d30, d1 , d23 + FMAC_I2 d31, d1 , d22 + + mov CO1, r4 // restore pointer + mov CO2, r2 // restore pointer + fstmiad CO1, { d24 - d25 } + fstmiad CO2, { d28 - d29 } + add CO1, CO1, r3 + add CO2, CO2, r3 + fstmiad CO1, { d26 - d27 } + fstmiad CO2, { d30 - d31 } + + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + .arm + .global REALNAME + .func REALNAME + +REALNAME: + + push {r4 - r9, fp} // save register + add fp, sp, #28 // add number of saved register multiplied by size of int + sub sp, sp, #STACKSIZE // reserve stack + + mov AO, OLD_A // pointer matrix A + mov BO, OLD_B // pointer matrix B + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r2, OLD_C // pointer matrix C + ldr r3, OLD_CSC // Col stride size of C + lsl r3, r3, #4 // multiply with size of complex double + + mov CO1, r2 // first line of C + add CO2, CO1, r3 // second line of C + + pld [ CO1, #C_PRE ] // prefetch the lines of C + pld [ CO2, #C_PRE ] // prefetch the lines of C + +zgemm_kernel_L2_M2_20: + + asrs L , K, #3 // L = K / 8 + cmp L , #2 + blt zgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #2 + ble zgemm_kernel_L2_M2_22a + .align 5 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt zgemm_kernel_L2_M2_22 + +zgemm_kernel_L2_M2_22a: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_32: + + tst L, #1 + ble 
zgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_40: + + INIT2x2 + +zgemm_kernel_L2_M2_44: + + ands L , K, #7 // L = K % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne zgemm_kernel_L2_M2_46 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + sub sp, fp, #28 + pop {r4 - r9, fp} + bx lr + diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk new file mode 100644 index 000000000..0b13d31e1 --- /dev/null +++ b/config/armv7a/make_defs.mk @@ -0,0 +1,108 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# Only include this block of code once. +ifndef MAKE_DEFS_MK_INCLUDED +MAKE_DEFS_MK_INCLUDED := yes + + + +# +# --- Build definitions -------------------------------------------------------- +# + +# Variables corresponding to other configure-time options. +BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes +BLIS_ENABLE_STATIC_BUILD := yes +BLIS_ENABLE_DYNAMIC_BUILD := no + + + +# +# --- Utility program definitions ---------------------------------------------- +# + +SH := /bin/sh +MV := mv +MKDIR := mkdir -p +RM_F := rm -f +RM_RF := rm -rf +SYMLINK := ln -sf +FIND := find +GREP := grep +XARGS := xargs +RANLIB := ranlib +INSTALL := install -c + +# Used to refresh CHANGELOG. +GIT := git +GIT_LOG := $(GIT) log --decorate + + + +# +# --- Development tools definitions -------------------------------------------- +# + +# --- Determine the C compiler and related flags --- +CC := gcc +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# NOTE: This is needed to enable posix_memalign(). +CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g +CDBGFLAGS := #-g +CWARNFLAGS := -Wall +COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g +CKOPTFLAGS := $(COPTFLAGS) +CVECFLAGS := #-msse3 # -mfpmath=sse + +# Aggregate all of the flags into multiple groups: one for standard +# compilation, and one for each of the supported "special" compilation +# modes. 
+CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) + +# --- Determine the archiver and related flags --- +AR := ar +ARFLAGS := cru + +# --- Determine the linker and related flags --- +LINKER := $(CC) +LDFLAGS := -lm + + + +# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block +endif