diff --git a/config/cortex-a15/bli_config.h b/config/cortex-a15/bli_config.h new file mode 100644 index 000000000..2df828311 --- /dev/null +++ b/config/cortex-a15/bli_config.h @@ -0,0 +1,169 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_CONFIG_H +#define BLIS_CONFIG_H + + +// -- OPERATING SYSTEM --------------------------------------------------------- + + + +// -- INTEGER PROPERTIES ------------------------------------------------------- + +// The bit size of the integer type used to track values such as dimensions, +// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed +// integers while 64 results in 64-bit integers. Any other value results in use +// of the C99 type "long int". Note that this ONLY affects integers used +// internally within BLIS as well as those exposed in the native BLAS-like BLIS +// interface. +#define BLIS_INT_TYPE_SIZE 32 + + + +// -- FLOATING-POINT PROPERTIES ------------------------------------------------ + +// Define the number of floating-point types supported, and the size of the +// largest type. +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + +// Enable use of built-in C99 "float complex" and "double complex" types and +// associated overloaded operations and functions? Disabling results in +// scomplex and dcomplex being defined in terms of simple structs. +//#define BLIS_ENABLE_C99_COMPLEX + + + +// -- MULTITHREADING ----------------------------------------------------------- + +// The maximum number of BLIS threads that will run concurrently. +#define BLIS_MAX_NUM_THREADS 1 + + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +// -- Contiguous (static) memory allocator -- + +// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the +// contiguous memory pools. +#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_MC_X_NC_BLOCKS 0 + +// The maximum preload byte offset is used to pad the end of the contiguous +// memory pools so that the micro-kernel, when computing with the end of the +// last block, can exceed the bounds of the usable portion of the memory +// region without causing a segmentation fault. +#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 + +// -- Memory alignment -- + +// It is sometimes useful to define the various memory alignments in terms +// of some other characteristics of the system, such as the cache line size +// and the page size. +#define BLIS_CACHE_LINE_SIZE 64 +#define BLIS_PAGE_SIZE 4096 + +// Alignment size needed by the instruction set for aligned SIMD/vector +// instructions. +#define BLIS_SIMD_ALIGN_SIZE 16 + +// Alignment size used to align local stack buffers within macro-kernel +// functions. +#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when allocating memory dynamically from the operating +// system (eg: posix_memalign()). To disable heap alignment and just use +// malloc() instead, set this to 1. +#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when sizing leading dimensions of dynamically +// allocated memory. +#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE + +// Alignment size used when allocating entire blocks of contiguous memory +// from the contiguous memory allocator. +#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE + +// Alignment size used when sizing strides (eg: of packed micro-panels) +// within a block of contiguous memory. +#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + + + +// -- MIXED DATATYPE SUPPORT --------------------------------------------------- + +// Basic (homogeneous) datatype support always enabled. + +// Enable mixed domain operations? +//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT + +// Enable extra mixed precision operations? +//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT + + + +// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- + +// Stay initialized after auto-initialization, unless and until the user +// explicitly calls bli_finalize(). +#define BLIS_ENABLE_STAY_AUTO_INITIALIZED + + + +// -- BLAS-to-BLIS COMPATIBILITY LAYER ----------------------------------------- + +// Enable the BLAS compatibility layer? +#define BLIS_ENABLE_BLAS2BLIS + +// The bit size of the integer type used to track values such as dimensions and +// leading dimensions (ie: column strides) within the BLAS compatibility layer. +// A value of 32 results in the compatibility layer using 32-bit signed integers +// while 64 results in 64-bit integers. Any other value results in use of the +// C99 type "long int". Note that this ONLY affects integers used within the +// BLAS compatibility layer. +#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 + +// Fortran-77 name-mangling macros. +#define PASTEF770(name) name ## _ +#define PASTEF77(ch1,name) ch1 ## name ## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ + + + + +#endif + diff --git a/config/cortex-a15/bli_kernel.h b/config/cortex-a15/bli_kernel.h new file mode 100644 index 000000000..a8228ea2f --- /dev/null +++ b/config/cortex-a15/bli_kernel.h @@ -0,0 +1,348 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_KERNEL_H +#define BLIS_KERNEL_H + + +// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- + +// -- Default cache blocksizes -- + +// +// Constraints: +// +// (1) MC must be a multiple of: +// (a) MR (for zero-padding purposes) +// (b) NR (for zero-padding purposes when MR and NR are "swapped") +// (2) NC must be a multiple of +// (a) NR (for zero-padding purposes) +// (b) MR (for zero-padding purposes when MR and NR are "swapped") +// (3) KC must be a multiple of +// (a) MR and +// (b) NR (for triangular operations such as trmm and trsm). +// + +#define BLIS_DEFAULT_MC_S 336 +#define BLIS_DEFAULT_KC_S 528 +#define BLIS_DEFAULT_NC_S 4096 + +#define BLIS_DEFAULT_MC_D 176 +#define BLIS_DEFAULT_KC_D 368 +#define BLIS_DEFAULT_NC_D 4096 + +#define BLIS_DEFAULT_MC_C 64 +#define BLIS_DEFAULT_KC_C 128 +#define BLIS_DEFAULT_NC_C 4096 + +#define BLIS_DEFAULT_MC_Z 64 +#define BLIS_DEFAULT_KC_Z 128 +#define BLIS_DEFAULT_NC_Z 4096 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. + +#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) + +// -- Default register blocksizes for micro-kernel -- + +// NOTE: When using the reference configuration, these register blocksizes +// in the m and n dimensions should all be equal to the size expected by +// the reference micro-kernel(s). + +#define BLIS_DEFAULT_MR_S 4 +#define BLIS_DEFAULT_NR_S 4 + +#define BLIS_DEFAULT_MR_D 4 +#define BLIS_DEFAULT_NR_D 4 + +#define BLIS_DEFAULT_MR_C 8 +#define BLIS_DEFAULT_NR_C 4 + +#define BLIS_DEFAULT_MR_Z 8 +#define BLIS_DEFAULT_NR_Z 4 + +// NOTE: If the micro-kernel, which is typically unrolled to a factor +// of f, handles leftover edge cases (ie: when k % f > 0) then these +// register blocksizes in the k dimension can be defined to 1. + +#define BLIS_DEFAULT_KR_S 1 +#define BLIS_DEFAULT_KR_D 1 +#define BLIS_DEFAULT_KR_C 1 +#define BLIS_DEFAULT_KR_Z 1 + +// -- Register blocksize extensions (for packed micro-panels) -- + +// NOTE: These register blocksize "extensions" determine whether the +// leading dimensions used within the packed micro-panels are equal to +// or greater than their corresponding register blocksizes above. + +#define BLIS_EXTEND_MR_S 0 +#define BLIS_EXTEND_NR_S 0 + +#define BLIS_EXTEND_MR_D 0 +#define BLIS_EXTEND_NR_D 0 + +#define BLIS_EXTEND_MR_C 0 +#define BLIS_EXTEND_NR_C 0 + +#define BLIS_EXTEND_MR_Z 0 +#define BLIS_EXTEND_NR_Z 0 + +// Register blocksize extensions in the k dimension are not used. + +#define BLIS_EXTEND_KR_S 0 +#define BLIS_EXTEND_KR_D 0 +#define BLIS_EXTEND_KR_C 0 +#define BLIS_EXTEND_KR_Z 0 + +// -- Default incremental packing blocksizes (n dimension) -- + +// NOTE: These incremental packing blocksizes (for the n dimension) are only +// used by certain blocked variants. But when the *are* used, they MUST be +// be an integer multiple of NR! + +#define BLIS_DEFAULT_NI_FAC 16 +#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S) +#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D) +#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C) +#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z) + + + +// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- + +// NOTE: These values determine high-level cache blocking for level-2 +// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and +// MC = NC = 1000, then a total of four unblocked (or unblocked fused) +// gemv subproblems are called. The blocked algorithms are only useful in +// that they provide the opportunity for packing vectors. (Matrices can also +// be packed here, but this tends to be much too expensive in practice to +// actually employ.) + +#define BLIS_DEFAULT_L2_MC_S 1000 +#define BLIS_DEFAULT_L2_NC_S 1000 + +#define BLIS_DEFAULT_L2_MC_D 1000 +#define BLIS_DEFAULT_L2_NC_D 1000 + +#define BLIS_DEFAULT_L2_MC_C 1000 +#define BLIS_DEFAULT_L2_NC_C 1000 + +#define BLIS_DEFAULT_L2_MC_Z 1000 +#define BLIS_DEFAULT_L2_NC_Z 1000 + + + +// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ + +// -- Default fusing factors for level-1f operations -- + +// NOTE: Default fusing factors are not used by the reference implementations +// of level-1f operations. They are here only for use when these operations +// are optimized. + +#define BLIS_DEFAULT_FUSE_FAC_S 8 +#define BLIS_DEFAULT_FUSE_FAC_D 4 +#define BLIS_DEFAULT_FUSE_FAC_C 4 +#define BLIS_DEFAULT_FUSE_FAC_Z 2 + +#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + +#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + +#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + + + +// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ + +// -- Default register blocksizes for vectors -- + +// NOTE: Register blocksizes for vectors are used when packing +// non-contiguous vectors. Similar to that of KR, they can +// typically be set to 1. + +#define BLIS_DEFAULT_VR_S 1 +#define BLIS_DEFAULT_VR_D 1 +#define BLIS_DEFAULT_VR_C 1 +#define BLIS_DEFAULT_VR_Z 1 + + + +// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- + +// -- gemm -- + +#include "bli_gemm_opt_4x4.h" +#define GEMM_UKERNEL gemm_opt_4x4 + +// -- trsm-related -- + +#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn +#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn + +#define TRSM_L_UKERNEL trsm_l_ref_mxn +#define TRSM_U_UKERNEL trsm_u_ref_mxn + + + +// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- + +// -- packm -- + +#define PACKM_2XK_KERNEL packm_ref_2xk +#define PACKM_4XK_KERNEL packm_ref_4xk +#define PACKM_6XK_KERNEL packm_ref_6xk +#define PACKM_8XK_KERNEL packm_ref_8xk +#define PACKM_10XK_KERNEL packm_ref_10xk +#define PACKM_12XK_KERNEL packm_ref_12xk +#define PACKM_14XK_KERNEL packm_ref_14xk +#define PACKM_16XK_KERNEL packm_ref_16xk + +// -- unpackm -- + +#define UNPACKM_2XK_KERNEL unpackm_ref_2xk +#define UNPACKM_4XK_KERNEL unpackm_ref_4xk +#define UNPACKM_6XK_KERNEL unpackm_ref_6xk +#define UNPACKM_8XK_KERNEL unpackm_ref_8xk +#define UNPACKM_10XK_KERNEL unpackm_ref_10xk +#define UNPACKM_12XK_KERNEL unpackm_ref_12xk +#define UNPACKM_14XK_KERNEL unpackm_ref_14xk +#define UNPACKM_16XK_KERNEL unpackm_ref_16xk + + + +// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- + +// -- axpy2v -- + +#define AXPY2V_KERNEL axpy2v_unb_var1 + +// -- dotaxpyv -- + +#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 + +// -- axpyf -- + +#define AXPYF_KERNEL axpyf_unb_var1 + +// -- dotxf -- + +#define DOTXF_KERNEL dotxf_unb_var1 + +// -- dotxaxpyf -- + +#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 + + + +// -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- + +// -- addv -- + +#define ADDV_KERNEL addv_unb_var1 + +// -- axpyv -- + +#define AXPYV_KERNEL axpyv_unb_var1 + +// -- copyv -- + +#define COPYV_KERNEL copyv_unb_var1 + +// -- dotv -- + +#define DOTV_KERNEL dotv_unb_var1 + +// -- dotxv -- + +#define DOTXV_KERNEL dotxv_unb_var1 + +// -- invertv -- + +#define INVERTV_KERNEL invertv_unb_var1 + +// -- scal2v -- + +#define SCAL2V_KERNEL scal2v_unb_var1 + +// -- scalv -- + +#define SCALV_KERNEL scalv_unb_var1 + +// -- setv -- + +#define SETV_KERNEL setv_unb_var1 + +// -- subv -- + +#define SUBV_KERNEL subv_unb_var1 + +// -- swapv -- + +#define SWAPV_KERNEL swapv_unb_var1 + + + +#endif + diff --git a/config/cortex-a15/kernels b/config/cortex-a15/kernels new file mode 120000 index 000000000..7a25007de --- /dev/null +++ b/config/cortex-a15/kernels @@ -0,0 +1 @@ +../../kernels/arm/neon \ No newline at end of file diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk new file mode 100644 index 000000000..cfe3789b9 --- /dev/null +++ b/config/cortex-a15/make_defs.mk @@ -0,0 +1,107 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2013, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# Only include this block of code once. +ifndef MAKE_DEFS_MK_INCLUDED +MAKE_DEFS_MK_INCLUDED := yes + + + +# +# --- Build definitions -------------------------------------------------------- +# + +# Variables corresponding to other configure-time options. +BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no +BLIS_ENABLE_STATIC_BUILD := yes +BLIS_ENABLE_DYNAMIC_BUILD := no + + + +# +# --- Utility program definitions ---------------------------------------------- +# + +SH := /bin/sh +MV := mv +MKDIR := mkdir -p +RM_F := rm -f +RM_RF := rm -rf +SYMLINK := ln -sf +FIND := find +XARGS := xargs +RANLIB := ranlib +INSTALL := install -c + +# Used to refresh CHANGELOG. +GIT := git +GIT_LOG := $(GIT) log --decorate + + + +# +# --- Development tools definitions -------------------------------------------- +# + +# --- Determine the C compiler and related flags --- +CC := gcc +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# NOTE: This is needed to enable posix_memalign(). +CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=neon +CDBGFLAGS := -g +CWARNFLAGS := -Wall +COPTFLAGS := -march=armv7-a -mfpu=neon -O2 +CKOPTFLAGS := $(COPTFLAGS) +CVECFLAGS := #-msse3 -march=native # -mfpmath=sse + +# Aggregate all of the flags into multiple groups: one for standard +# compilation, and one for each of the supported "special" compilation +# modes. +CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) + +# --- Determine the archiver and related flags --- +AR := ar +ARFLAGS := cru + +# --- Determine the linker and related flags --- +LINKER := $(CC) +LDFLAGS := + + + +# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block +endif diff --git a/config/cortex-a9/bli_config.h b/config/cortex-a9/bli_config.h new file mode 100644 index 000000000..2df828311 --- /dev/null +++ b/config/cortex-a9/bli_config.h @@ -0,0 +1,169 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_CONFIG_H +#define BLIS_CONFIG_H + + +// -- OPERATING SYSTEM --------------------------------------------------------- + + + +// -- INTEGER PROPERTIES ------------------------------------------------------- + +// The bit size of the integer type used to track values such as dimensions, +// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed +// integers while 64 results in 64-bit integers. Any other value results in use +// of the C99 type "long int". Note that this ONLY affects integers used +// internally within BLIS as well as those exposed in the native BLAS-like BLIS +// interface. +#define BLIS_INT_TYPE_SIZE 32 + + + +// -- FLOATING-POINT PROPERTIES ------------------------------------------------ + +// Define the number of floating-point types supported, and the size of the +// largest type. +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + +// Enable use of built-in C99 "float complex" and "double complex" types and +// associated overloaded operations and functions? Disabling results in +// scomplex and dcomplex being defined in terms of simple structs. +//#define BLIS_ENABLE_C99_COMPLEX + + + +// -- MULTITHREADING ----------------------------------------------------------- + +// The maximum number of BLIS threads that will run concurrently. +#define BLIS_MAX_NUM_THREADS 1 + + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +// -- Contiguous (static) memory allocator -- + +// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the +// contiguous memory pools. +#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_MC_X_NC_BLOCKS 0 + +// The maximum preload byte offset is used to pad the end of the contiguous +// memory pools so that the micro-kernel, when computing with the end of the +// last block, can exceed the bounds of the usable portion of the memory +// region without causing a segmentation fault. +#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 + +// -- Memory alignment -- + +// It is sometimes useful to define the various memory alignments in terms +// of some other characteristics of the system, such as the cache line size +// and the page size. +#define BLIS_CACHE_LINE_SIZE 64 +#define BLIS_PAGE_SIZE 4096 + +// Alignment size needed by the instruction set for aligned SIMD/vector +// instructions. +#define BLIS_SIMD_ALIGN_SIZE 16 + +// Alignment size used to align local stack buffers within macro-kernel +// functions. +#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when allocating memory dynamically from the operating +// system (eg: posix_memalign()). To disable heap alignment and just use +// malloc() instead, set this to 1. +#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when sizing leading dimensions of dynamically +// allocated memory. +#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE + +// Alignment size used when allocating entire blocks of contiguous memory +// from the contiguous memory allocator. +#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE + +// Alignment size used when sizing strides (eg: of packed micro-panels) +// within a block of contiguous memory. +#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + + + +// -- MIXED DATATYPE SUPPORT --------------------------------------------------- + +// Basic (homogeneous) datatype support always enabled. + +// Enable mixed domain operations? +//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT + +// Enable extra mixed precision operations? +//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT + + + +// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- + +// Stay initialized after auto-initialization, unless and until the user +// explicitly calls bli_finalize(). +#define BLIS_ENABLE_STAY_AUTO_INITIALIZED + + + +// -- BLAS-to-BLIS COMPATIBILITY LAYER ----------------------------------------- + +// Enable the BLAS compatibility layer? +#define BLIS_ENABLE_BLAS2BLIS + +// The bit size of the integer type used to track values such as dimensions and +// leading dimensions (ie: column strides) within the BLAS compatibility layer. +// A value of 32 results in the compatibility layer using 32-bit signed integers +// while 64 results in 64-bit integers. Any other value results in use of the +// C99 type "long int". Note that this ONLY affects integers used within the +// BLAS compatibility layer. +#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 + +// Fortran-77 name-mangling macros. +#define PASTEF770(name) name ## _ +#define PASTEF77(ch1,name) ch1 ## name ## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ + + + + +#endif + diff --git a/config/cortex-a9/bli_kernel.h b/config/cortex-a9/bli_kernel.h new file mode 100644 index 000000000..f05687545 --- /dev/null +++ b/config/cortex-a9/bli_kernel.h @@ -0,0 +1,348 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_KERNEL_H +#define BLIS_KERNEL_H + + +// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- + +// -- Default cache blocksizes -- + +// +// Constraints: +// +// (1) MC must be a multiple of: +// (a) MR (for zero-padding purposes) +// (b) NR (for zero-padding purposes when MR and NR are "swapped") +// (2) NC must be a multiple of +// (a) NR (for zero-padding purposes) +// (b) MR (for zero-padding purposes when MR and NR are "swapped") +// (3) KC must be a multiple of +// (a) MR and +// (b) NR (for triangular operations such as trmm and trsm). +// + +#define BLIS_DEFAULT_MC_S 432 +#define BLIS_DEFAULT_KC_S 352 +#define BLIS_DEFAULT_NC_S 4096 + +#define BLIS_DEFAULT_MC_D 176 +#define BLIS_DEFAULT_KC_D 368 +#define BLIS_DEFAULT_NC_D 4096 + +#define BLIS_DEFAULT_MC_C 64 +#define BLIS_DEFAULT_KC_C 128 +#define BLIS_DEFAULT_NC_C 4096 + +#define BLIS_DEFAULT_MC_Z 64 +#define BLIS_DEFAULT_KC_Z 128 +#define BLIS_DEFAULT_NC_Z 4096 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. + +#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) + +// -- Default register blocksizes for micro-kernel -- + +// NOTE: When using the reference configuration, these register blocksizes +// in the m and n dimensions should all be equal to the size expected by +// the reference micro-kernel(s). + +#define BLIS_DEFAULT_MR_S 4 +#define BLIS_DEFAULT_NR_S 4 + +#define BLIS_DEFAULT_MR_D 4 +#define BLIS_DEFAULT_NR_D 4 + +#define BLIS_DEFAULT_MR_C 8 +#define BLIS_DEFAULT_NR_C 4 + +#define BLIS_DEFAULT_MR_Z 8 +#define BLIS_DEFAULT_NR_Z 4 + +// NOTE: If the micro-kernel, which is typically unrolled to a factor +// of f, handles leftover edge cases (ie: when k % f > 0) then these +// register blocksizes in the k dimension can be defined to 1. + +#define BLIS_DEFAULT_KR_S 1 +#define BLIS_DEFAULT_KR_D 1 +#define BLIS_DEFAULT_KR_C 1 +#define BLIS_DEFAULT_KR_Z 1 + +// -- Register blocksize extensions (for packed micro-panels) -- + +// NOTE: These register blocksize "extensions" determine whether the +// leading dimensions used within the packed micro-panels are equal to +// or greater than their corresponding register blocksizes above. + +#define BLIS_EXTEND_MR_S 0 +#define BLIS_EXTEND_NR_S 0 + +#define BLIS_EXTEND_MR_D 0 +#define BLIS_EXTEND_NR_D 0 + +#define BLIS_EXTEND_MR_C 0 +#define BLIS_EXTEND_NR_C 0 + +#define BLIS_EXTEND_MR_Z 0 +#define BLIS_EXTEND_NR_Z 0 + +// Register blocksize extensions in the k dimension are not used. + +#define BLIS_EXTEND_KR_S 0 +#define BLIS_EXTEND_KR_D 0 +#define BLIS_EXTEND_KR_C 0 +#define BLIS_EXTEND_KR_Z 0 + +// -- Default incremental packing blocksizes (n dimension) -- + +// NOTE: These incremental packing blocksizes (for the n dimension) are only +// used by certain blocked variants. But when the *are* used, they MUST be +// be an integer multiple of NR! + +#define BLIS_DEFAULT_NI_FAC 16 +#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S) +#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D) +#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C) +#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z) + + + +// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- + +// NOTE: These values determine high-level cache blocking for level-2 +// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and +// MC = NC = 1000, then a total of four unblocked (or unblocked fused) +// gemv subproblems are called. The blocked algorithms are only useful in +// that they provide the opportunity for packing vectors. (Matrices can also +// be packed here, but this tends to be much too expensive in practice to +// actually employ.) + +#define BLIS_DEFAULT_L2_MC_S 1000 +#define BLIS_DEFAULT_L2_NC_S 1000 + +#define BLIS_DEFAULT_L2_MC_D 1000 +#define BLIS_DEFAULT_L2_NC_D 1000 + +#define BLIS_DEFAULT_L2_MC_C 1000 +#define BLIS_DEFAULT_L2_NC_C 1000 + +#define BLIS_DEFAULT_L2_MC_Z 1000 +#define BLIS_DEFAULT_L2_NC_Z 1000 + + + +// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ + +// -- Default fusing factors for level-1f operations -- + +// NOTE: Default fusing factors are not used by the reference implementations +// of level-1f operations. They are here only for use when these operations +// are optimized. + +#define BLIS_DEFAULT_FUSE_FAC_S 8 +#define BLIS_DEFAULT_FUSE_FAC_D 4 +#define BLIS_DEFAULT_FUSE_FAC_C 4 +#define BLIS_DEFAULT_FUSE_FAC_Z 2 + +#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + +#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + +#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S +#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D +#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C +#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z + + + +// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ + +// -- Default register blocksizes for vectors -- + +// NOTE: Register blocksizes for vectors are used when packing +// non-contiguous vectors. Similar to that of KR, they can +// typically be set to 1. + +#define BLIS_DEFAULT_VR_S 1 +#define BLIS_DEFAULT_VR_D 1 +#define BLIS_DEFAULT_VR_C 1 +#define BLIS_DEFAULT_VR_Z 1 + + + +// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- + +// -- gemm -- + +#include "bli_gemm_opt_4x4.h" +#define GEMM_UKERNEL gemm_opt_4x4 + +// -- trsm-related -- + +#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn +#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn + +#define TRSM_L_UKERNEL trsm_l_ref_mxn +#define TRSM_U_UKERNEL trsm_u_ref_mxn + + + +// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- + +// -- packm -- + +#define PACKM_2XK_KERNEL packm_ref_2xk +#define PACKM_4XK_KERNEL packm_ref_4xk +#define PACKM_6XK_KERNEL packm_ref_6xk +#define PACKM_8XK_KERNEL packm_ref_8xk +#define PACKM_10XK_KERNEL packm_ref_10xk +#define PACKM_12XK_KERNEL packm_ref_12xk +#define PACKM_14XK_KERNEL packm_ref_14xk +#define PACKM_16XK_KERNEL packm_ref_16xk + +// -- unpackm -- + +#define UNPACKM_2XK_KERNEL unpackm_ref_2xk +#define UNPACKM_4XK_KERNEL unpackm_ref_4xk +#define UNPACKM_6XK_KERNEL unpackm_ref_6xk +#define UNPACKM_8XK_KERNEL unpackm_ref_8xk +#define UNPACKM_10XK_KERNEL unpackm_ref_10xk +#define UNPACKM_12XK_KERNEL unpackm_ref_12xk +#define UNPACKM_14XK_KERNEL unpackm_ref_14xk +#define UNPACKM_16XK_KERNEL unpackm_ref_16xk + + + +// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- + +// -- axpy2v -- + +#define AXPY2V_KERNEL axpy2v_unb_var1 + +// -- dotaxpyv -- + +#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 + +// -- axpyf -- + +#define AXPYF_KERNEL axpyf_unb_var1 + +// -- dotxf -- + +#define DOTXF_KERNEL dotxf_unb_var1 + +// -- dotxaxpyf -- + +#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 + + + +// -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- + +// -- addv -- + +#define ADDV_KERNEL addv_unb_var1 + +// -- axpyv -- + +#define AXPYV_KERNEL axpyv_unb_var1 + +// -- copyv -- + +#define COPYV_KERNEL copyv_unb_var1 + +// -- dotv -- + +#define DOTV_KERNEL dotv_unb_var1 + +// -- dotxv -- + +#define DOTXV_KERNEL dotxv_unb_var1 + +// -- invertv -- + +#define INVERTV_KERNEL invertv_unb_var1 + +// -- scal2v -- + +#define SCAL2V_KERNEL scal2v_unb_var1 + +// -- scalv -- + +#define SCALV_KERNEL scalv_unb_var1 + +// -- setv -- + +#define SETV_KERNEL setv_unb_var1 + +// -- subv -- + +#define SUBV_KERNEL subv_unb_var1 + +// -- swapv -- + +#define SWAPV_KERNEL swapv_unb_var1 + + + +#endif + diff --git a/config/cortex-a9/kernels b/config/cortex-a9/kernels new file mode 120000 index 000000000..7a25007de --- /dev/null +++ b/config/cortex-a9/kernels @@ -0,0 +1 @@ +../../kernels/arm/neon \ No newline at end of file diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk new file mode 100644 index 000000000..a3843349d --- /dev/null +++ b/config/cortex-a9/make_defs.mk @@ -0,0 +1,107 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2013, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# Only include this block of code once. +ifndef MAKE_DEFS_MK_INCLUDED +MAKE_DEFS_MK_INCLUDED := yes + + + +# +# --- Build definitions -------------------------------------------------------- +# + +# Variables corresponding to other configure-time options. +BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no +BLIS_ENABLE_STATIC_BUILD := yes +BLIS_ENABLE_DYNAMIC_BUILD := no + + + +# +# --- Utility program definitions ---------------------------------------------- +# + +SH := /bin/sh +MV := mv +MKDIR := mkdir -p +RM_F := rm -f +RM_RF := rm -rf +SYMLINK := ln -sf +FIND := find +XARGS := xargs +RANLIB := ranlib +INSTALL := install -c + +# Used to refresh CHANGELOG. +GIT := git +GIT_LOG := $(GIT) log --decorate + + + +# +# --- Development tools definitions -------------------------------------------- +# + +# --- Determine the C compiler and related flags --- +CC := gcc +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# NOTE: This is needed to enable posix_memalign(). +CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=neon +CDBGFLAGS := -g +CWARNFLAGS := -Wall +COPTFLAGS := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard +CKOPTFLAGS := $(COPTFLAGS) +CVECFLAGS := #-msse3 -march=native # -mfpmath=sse + +# Aggregate all of the flags into multiple groups: one for standard +# compilation, and one for each of the supported "special" compilation +# modes. +CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) + +# --- Determine the archiver and related flags --- +AR := ar +ARFLAGS := cru + +# --- Determine the linker and related flags --- +LINKER := $(CC) +LDFLAGS := + + + +# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block +endif diff --git a/kernels/arm/neon/3/bli_gemm_opt_4x4.c b/kernels/arm/neon/3/bli_gemm_opt_4x4.c new file mode 100644 index 000000000..eaa6909b6 --- /dev/null +++ b/kernels/arm/neon/3/bli_gemm_opt_4x4.c @@ -0,0 +1,538 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_sgemm_opt_4x4( + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + float* restrict a_next, + float* restrict b_next + ) +{ + float32x4_t alphav; + alphav = vmovq_n_f32( *alpha ); + + float32x4_t av1; + float32x4_t av2; + float32x4_t av3; + float32x4_t av4; + + float32x4_t bv1; + float32x4_t bv2; + float32x4_t bv3; + float32x4_t bv4; + + dim_t k_iter = k/4; + dim_t k_left = k%4; + dim_t i; + + // Vector for column 0 + float32x4_t cv0; + // Vector for column 1 + float32x4_t cv1; + // Vector for column 2 + float32x4_t cv2; + // Vector for column 3 + float32x4_t cv3; + + if( rs_c == 1 ) + { + // Load column 0 + cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c ); + + // Load column 1 + cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c ); + + // Load column 2 + cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c ); + + // Load column 3 + cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c ); + } + else + { + // Load column 0 + cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0); + cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1); + cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2); + cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3); + + // Load column 1 + cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0); + cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1); + cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2); + cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3); + + // Load column 2 + cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0); + cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1); + cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2); + cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3); + + // Load column 3 + cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0); + cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1); + cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2); + cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3); + + } + + // Vector for accummulating column 0 + float32x4_t abv0; + // Initialize vector to 0.0 + abv0 = vmovq_n_f32( 0.0 ); + + // Vector for accummulating column 1 + float32x4_t abv1; + // Initialize vector to 0.0 + abv1 = vmovq_n_f32( 0.0 ); + + // Vector for accummulating column 2 + float32x4_t abv2; + // Initialize vector to 0.0 + abv2 = vmovq_n_f32( 0.0 ); + + // Vector for accummulating column 3 + float32x4_t abv3; + // Initialize vector to 0.0 + abv3 = vmovq_n_f32( 0.0 ); + + for ( i = 0; i < k_iter; ++i ) + { + // Begin iter 0 + av1 = vld1q_f32( a ); + + __builtin_prefetch( a + 224 ); + __builtin_prefetch( b + 224 ); + + bv1 = vld1q_f32( b ); + + abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 ); + abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 ); + abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 ); + abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 ); + + + av2 = vld1q_f32( a+4 ); + + //__builtin_prefetch( a + 116 ); + //__builtin_prefetch( b + 116 ); + + bv2 = vld1q_f32( b+4 ); + + abv0 = vmlaq_lane_f32( abv0, av2, vget_low_f32(bv2), 0 ); + abv1 = vmlaq_lane_f32( abv1, av2, vget_low_f32(bv2), 1 ); + abv2 = vmlaq_lane_f32( abv2, av2, vget_high_f32(bv2), 0 ); + abv3 = vmlaq_lane_f32( abv3, av2, vget_high_f32(bv2), 1 ); + + av3 = vld1q_f32( a+8 ); + + //__builtin_prefetch( a + 120 ); + //__builtin_prefetch( b + 120 ); + + bv3 = vld1q_f32( b+8 ); + + abv0 = vmlaq_lane_f32( abv0, av3, vget_low_f32(bv3), 0 ); + abv1 = vmlaq_lane_f32( abv1, av3, vget_low_f32(bv3), 1 ); + abv2 = vmlaq_lane_f32( abv2, av3, vget_high_f32(bv3), 0 ); + abv3 = vmlaq_lane_f32( abv3, av3, vget_high_f32(bv3), 1 ); + + + av4 = vld1q_f32( a+12); + + //__builtin_prefetch( a + 124 ); + //__builtin_prefetch( b + 124 ); + + bv4 = vld1q_f32( b+12); + + abv0 = vmlaq_lane_f32( abv0, av4, vget_low_f32(bv4), 0 ); + abv1 = vmlaq_lane_f32( abv1, av4, vget_low_f32(bv4), 1 ); + abv2 = vmlaq_lane_f32( abv2, av4, vget_high_f32(bv4), 0 ); + abv3 = vmlaq_lane_f32( abv3, av4, vget_high_f32(bv4), 1 ); + + + + a += 16; + b += 16; + } + + for ( i = 0; i < k_left; ++i ) + { + av1 = vld1q_f32( a ); + + __builtin_prefetch( a + 112 ); + __builtin_prefetch( b + 112 ); + + bv1 = vld1q_f32( b ); + + abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 ); + abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 ); + abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 ); + abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 ); + + a += 4; + b += 4; + } + + __builtin_prefetch( a_next ); + __builtin_prefetch( b_next ); + + cv0 = vmulq_n_f32( cv0, *beta ); + cv1 = vmulq_n_f32( cv1, *beta ); + cv2 = vmulq_n_f32( cv2, *beta ); + cv3 = vmulq_n_f32( cv3, *beta ); + + cv0 = vmlaq_f32( cv0, abv0, alphav ); + cv1 = vmlaq_f32( cv1, abv1, alphav ); + cv2 = vmlaq_f32( cv2, abv2, alphav ); + cv3 = vmlaq_f32( cv3, abv3, alphav ); + + if( rs_c == 1 ) + { + // Store column 0 + vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 ); + // Store column 1 + vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 ); + // Store column 2 + vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 ); + // Store column 3 + vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 ); + } + else{ + // Store column 0 + vst1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0); + vst1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1); + vst1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2); + vst1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3); + + // Store column 1 + vst1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0); + vst1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1); + vst1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2); + vst1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3); + + // Store column 2 + vst1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0); + vst1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1); + vst1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2); + vst1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3); + + // Store column 3 + vst1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0); + vst1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1); + vst1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2); + vst1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3); + } +} + +void bli_dgemm_opt_4x4( + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + double* restrict a_next, + double* restrict b_next + ) +{ + //dim_t k_iter; + dim_t k_left; + + //k_iter = k / 2; + k_left = k % 2; + + register double a0; + register double a1; + register double a2; + register double a3; + + register double A0; + register double A1; + register double A2; + register double A3; + + double b0, b1, b2, b3; + double B0, B1, B2, B3; + + double ab00, ab01, ab02, ab03; + double ab10, ab11, ab12, ab13; + double ab20, ab21, ab22, ab23; + double ab30, ab31, ab32, ab33; + + double* restrict c00, * restrict c01, * restrict c02, * restrict c03; + double* restrict c10, * restrict c11, * restrict c12, * restrict c13; + double* restrict c20, * restrict c21, * restrict c22, * restrict c23; + double* restrict c30, * restrict c31, * restrict c32, * restrict c33; + + double* restrict ap = a; + double* restrict bp = b; + + double* restrict Ap = a + 4; + double* restrict Bp = b + 4; + + dim_t i; + + c00 = (c + 0*rs_c + 0*cs_c); + c10 = (c + 1*rs_c + 0*cs_c); + c20 = (c + 2*rs_c + 0*cs_c); + c30 = (c + 3*rs_c + 0*cs_c); + + c01 = (c + 0*rs_c + 1*cs_c); + c11 = (c + 1*rs_c + 1*cs_c); + c21 = (c + 2*rs_c + 1*cs_c); + c31 = (c + 3*rs_c + 1*cs_c); + + c02 = (c + 0*rs_c + 2*cs_c); + c12 = (c + 1*rs_c + 2*cs_c); + c22 = (c + 2*rs_c + 2*cs_c); + c32 = (c + 3*rs_c + 2*cs_c); + + c03 = (c + 0*rs_c + 3*cs_c); + c13 = (c + 1*rs_c + 3*cs_c); + c23 = (c + 2*rs_c + 3*cs_c); + c33 = (c + 3*rs_c + 3*cs_c); + + ab00 = 0.0; ab10 = 0.0; ab20 = 0.0; ab30 = 0.0; + ab01 = 0.0; ab11 = 0.0; ab21 = 0.0; ab31 = 0.0; + ab02 = 0.0; ab12 = 0.0; ab22 = 0.0; ab32 = 0.0; + ab03 = 0.0; ab13 = 0.0; ab23 = 0.0; ab33 = 0.0; + + A0 = *(Ap + 0); + A1 = *(Ap + 1); + A2 = *(Ap + 2); + A3 = *(Ap + 3); + + a0 = *(ap + 0); + a1 = *(ap + 1); + a2 = *(ap + 2); + + B0 = *(Bp + 0); + B1 = *(Bp + 1); + B2 = *(Bp + 2); + B3 = *(Bp + 3); + + b0 = *(bp + 0); + b1 = *(bp + 1); + b2 = *(bp + 2); + + double *Aplast = (Ap + 4*k); + + //for ( i = 0; i < k_iter; ++i ) // Unroll by factor 4. + for ( ; Ap != Aplast ; ) // Unroll by factor 4. + { + /* Prefetch */ + //__asm__ ("pld\t[%0],#100\n\t" : :"r"(Ap) : ); + __builtin_prefetch( ap + 112 ); + __builtin_prefetch( Ap + 112 ); + __builtin_prefetch( bp + 112 ); + __builtin_prefetch( Bp + 112 ); + // Iteration 0. + ab00 += A0 * B0; + a3 = *(ap + 3); + ab10 += A1 * B0; + b3 = *(bp + 3); + ab20 += A2 * B0; + ab30 += A3 * B0; + + ab01 += A0 * B1; + ab11 += A1 * B1; + B0 = *(Bp + 8); // Prefetch. + ab21 += A2 * B1; + ab31 += A3 * B1; + + ab02 += A0 * B2; + B1 = *(Bp + 9); + ab12 += A1 * B2; + ab22 += A2 * B2; + ab32 += A3 * B2; + B2 = *(Bp + 10); + + ab03 += A0 * B3; + A0 = *(Ap + 8); // Prefetch. + ab13 += A1 * B3; + A1 = *(Ap + 9); // Prefetch. + ab23 += A2 * B3; + ab33 += A3 * B3; + A2 = *(Ap + 10); // Prefetch. + + // Iteration 1. + //__asm__ ("pld\t[%0],#200\n\t" : :"r"(Ap) : ); + ab00 += a0 * b0; + ab10 += a1 * b0; + A3 = *(Ap + 11); // Prefetch. + ab20 += a2 * b0; + ab30 += a3 * b0; + B3 = *(Bp + 11); + + ab01 += a0 * b1; + b0 = *(bp + 8); + ab11 += a1 * b1; + ab21 += a2 * b1; + ab31 += a3 * b1; + b1 = *(bp + 9); + + ab02 += a0 * b2; + ab12 += a1 * b2; + ab22 += a2 * b2; + ab32 += a3 * b2; + b2 = *(bp + 10); + + ab03 += a0 * b3; + a0 = *(ap + 8); + ab13 += a1 * b3; + a1 = *(ap + 9); + ab23 += a2 * b3; + a2 = *(ap + 10); + ab33 += a3 * b3; + //a3 = *(ap + 11); + + ap += 8; + Ap += 8; + bp += 8; + Bp += 8; + + } + + for ( i = 0; i < k_left; ++i ) + { + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + + ab00 += a0 * b0; + ab10 += a1 * b0; + ab20 += a2 * b0; + ab30 += a3 * b0; + + ab01 += a0 * b1; + ab11 += a1 * b1; + ab21 += a2 * b1; + ab31 += a3 * b1; + + ab02 += a0 * b2; + ab12 += a1 * b2; + ab22 += a2 * b2; + ab32 += a3 * b2; + + ab03 += a0 * b3; + ab13 += a1 * b3; + ab23 += a2 * b3; + ab33 += a3 * b3; + + a += 4; + b += 2; + } + + *c00 = *c00 * *beta; + *c10 = *c10 * *beta; + *c20 = *c20 * *beta; + *c30 = *c30 * *beta; + + *c01 = *c01 * *beta; + *c11 = *c11 * *beta; + *c21 = *c21 * *beta; + *c31 = *c31 * *beta; + + *c02 = *c02 * *beta; + *c12 = *c12 * *beta; + *c22 = *c22 * *beta; + *c32 = *c32 * *beta; + + *c03 = *c03 * *beta; + *c13 = *c13 * *beta; + *c23 = *c23 * *beta; + *c33 = *c33 * *beta; + + *c00 += ab00 * *alpha; + *c10 += ab10 * *alpha; + *c20 += ab20 * *alpha; + *c30 += ab30 * *alpha; + + *c01 += ab01 * *alpha; + *c11 += ab11 * *alpha; + *c21 += ab21 * *alpha; + *c31 += ab31 * *alpha; + + *c02 += ab02 * *alpha; + *c12 += ab12 * *alpha; + *c22 += ab22 * *alpha; + *c32 += ab32 * *alpha; + + *c03 += ab03 * *alpha; + *c13 += ab13 * *alpha; + *c23 += ab23 * *alpha; + *c33 += ab33 * *alpha; + +} + +void bli_cgemm_opt_4x4( + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + scomplex* restrict a_next, + scomplex* restrict b_next + ) +{ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); +} + +void bli_zgemm_opt_4x4( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + dcomplex* restrict a_next, + dcomplex* restrict b_next + ) +{ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); +} + diff --git a/kernels/arm/neon/3/bli_gemm_opt_4x4.h b/kernels/arm/neon/3/bli_gemm_opt_4x4.h new file mode 100644 index 000000000..e81e22cc8 --- /dev/null +++ b/kernels/arm/neon/3/bli_gemm_opt_4x4.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "arm_neon.h" + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict a_next, \ + ctype* restrict b_next \ + ); + +INSERT_GENTPROT_BASIC( gemm_opt_4x4 )