Added ARM kernels, configurations.

Details:
- Added kernels for ARM, and configurations for Cortex-A9 and Cortex-A15.
  Thanks to Francisco Igual for contributing these kernels and
  configurations.
This commit is contained in:
Field G. Van Zee
2013-11-16 17:34:25 -06:00
parent d37c2cff62
commit d70733abdd
10 changed files with 1840 additions and 0 deletions

View File

@@ -0,0 +1,169 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// -- OPERATING SYSTEM ---------------------------------------------------------
// -- INTEGER PROPERTIES -------------------------------------------------------
// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#define BLIS_INT_TYPE_SIZE 32
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
//#define BLIS_ENABLE_C99_COMPLEX
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
#define BLIS_MAX_NUM_THREADS 1
// -- MEMORY ALLOCATION --------------------------------------------------------
// -- Contiguous (static) memory allocator --
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_MC_X_NC_BLOCKS 0
// The maximum preload byte offset is used to pad the end of the contiguous
// memory pools so that the micro-kernel, when computing with the end of the
// last block, can exceed the bounds of the usable portion of the memory
// region without causing a segmentation fault.
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
// -- Memory alignment --
// It is sometimes useful to define the various memory alignments in terms
// of some other characteristics of the system, such as the cache line size
// and the page size.
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
// Alignment size used when allocating entire blocks of contiguous memory
// from the contiguous memory allocator.
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
// Alignment size used when sizing strides (eg: of packed micro-panels)
// within a block of contiguous memory.
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
// Basic (homogeneous) datatype support always enabled.
// Enable mixed domain operations?
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
// Enable extra mixed precision operations?
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
// Stay initialized after auto-initialization, unless and until the user
// explicitly calls bli_finalize().
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
// Enable the BLAS compatibility layer?
#define BLIS_ENABLE_BLAS2BLIS
// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
#endif

View File

@@ -0,0 +1,348 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
// (3) KC must be a multiple of
// (a) MR and
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 336
#define BLIS_DEFAULT_KC_S 528
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_D 176
#define BLIS_DEFAULT_KC_D 368
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
// -- Register blocksize extensions (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
// used by certain blocked variants. But when the *are* used, they MUST be
// be an integer multiple of NR!
#define BLIS_DEFAULT_NI_FAC 16
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- gemm --
#include "bli_gemm_opt_4x4.h"
#define GEMM_UKERNEL gemm_opt_4x4
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

1
config/cortex-a15/kernels Symbolic link
View File

@@ -0,0 +1 @@
../../kernels/arm/neon

View File

@@ -0,0 +1,107 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2013, The University of Texas
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Only include this block of code once.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := gcc
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=neon
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -march=armv7-a -mfpu=neon -O2
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
LDFLAGS :=
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif

View File

@@ -0,0 +1,169 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// -- OPERATING SYSTEM ---------------------------------------------------------
// -- INTEGER PROPERTIES -------------------------------------------------------
// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#define BLIS_INT_TYPE_SIZE 32
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
//#define BLIS_ENABLE_C99_COMPLEX
// -- MULTITHREADING -----------------------------------------------------------
// The maximum number of BLIS threads that will run concurrently.
#define BLIS_MAX_NUM_THREADS 1
// -- MEMORY ALLOCATION --------------------------------------------------------
// -- Contiguous (static) memory allocator --
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
// contiguous memory pools.
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
#define BLIS_NUM_MC_X_NC_BLOCKS 0
// The maximum preload byte offset is used to pad the end of the contiguous
// memory pools so that the micro-kernel, when computing with the end of the
// last block, can exceed the bounds of the usable portion of the memory
// region without causing a segmentation fault.
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
// -- Memory alignment --
// It is sometimes useful to define the various memory alignments in terms
// of some other characteristics of the system, such as the cache line size
// and the page size.
#define BLIS_CACHE_LINE_SIZE 64
#define BLIS_PAGE_SIZE 4096
// Alignment size needed by the instruction set for aligned SIMD/vector
// instructions.
#define BLIS_SIMD_ALIGN_SIZE 16
// Alignment size used to align local stack buffers within macro-kernel
// functions.
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
// Alignment size used when allocating entire blocks of contiguous memory
// from the contiguous memory allocator.
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
// Alignment size used when sizing strides (eg: of packed micro-panels)
// within a block of contiguous memory.
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
// Basic (homogeneous) datatype support always enabled.
// Enable mixed domain operations?
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
// Enable extra mixed precision operations?
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
// Stay initialized after auto-initialization, unless and until the user
// explicitly calls bli_finalize().
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
// Enable the BLAS compatibility layer?
#define BLIS_ENABLE_BLAS2BLIS
// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
#endif

View File

@@ -0,0 +1,348 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Default cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
// (3) KC must be a multiple of
// (a) MR and
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 432
#define BLIS_DEFAULT_KC_S 352
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_D 176
#define BLIS_DEFAULT_KC_D 368
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
// -- Cache blocksize extensions (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// non-zero, blocksizes used at edge cases are extended (enlarged) if
// such an extension would encompass the remaining portion of the
// matrix dimension.
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
// -- Default register blocksizes for micro-kernel --
// NOTE: When using the reference configuration, these register blocksizes
// in the m and n dimensions should all be equal to the size expected by
// the reference micro-kernel(s).
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
#define BLIS_DEFAULT_KR_S 1
#define BLIS_DEFAULT_KR_D 1
#define BLIS_DEFAULT_KR_C 1
#define BLIS_DEFAULT_KR_Z 1
// -- Register blocksize extensions (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
#define BLIS_EXTEND_MR_S 0
#define BLIS_EXTEND_NR_S 0
#define BLIS_EXTEND_MR_D 0
#define BLIS_EXTEND_NR_D 0
#define BLIS_EXTEND_MR_C 0
#define BLIS_EXTEND_NR_C 0
#define BLIS_EXTEND_MR_Z 0
#define BLIS_EXTEND_NR_Z 0
// Register blocksize extensions in the k dimension are not used.
#define BLIS_EXTEND_KR_S 0
#define BLIS_EXTEND_KR_D 0
#define BLIS_EXTEND_KR_C 0
#define BLIS_EXTEND_KR_Z 0
// -- Default incremental packing blocksizes (n dimension) --
// NOTE: These incremental packing blocksizes (for the n dimension) are only
// used by certain blocked variants. But when the *are* used, they MUST be
// be an integer multiple of NR!
#define BLIS_DEFAULT_NI_FAC 16
#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// NOTE: These values determine high-level cache blocking for level-2
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
// gemv subproblems are called. The blocked algorithms are only useful in
// that they provide the opportunity for packing vectors. (Matrices can also
// be packed here, but this tends to be much too expensive in practice to
// actually employ.)
#define BLIS_DEFAULT_L2_MC_S 1000
#define BLIS_DEFAULT_L2_NC_S 1000
#define BLIS_DEFAULT_L2_MC_D 1000
#define BLIS_DEFAULT_L2_NC_D 1000
#define BLIS_DEFAULT_L2_MC_C 1000
#define BLIS_DEFAULT_L2_NC_C 1000
#define BLIS_DEFAULT_L2_MC_Z 1000
#define BLIS_DEFAULT_L2_NC_Z 1000
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- Default fusing factors for level-1f operations --
// NOTE: Default fusing factors are not used by the reference implementations
// of level-1f operations. They are here only for use when these operations
// are optimized.
#define BLIS_DEFAULT_FUSE_FAC_S 8
#define BLIS_DEFAULT_FUSE_FAC_D 4
#define BLIS_DEFAULT_FUSE_FAC_C 4
#define BLIS_DEFAULT_FUSE_FAC_Z 2
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
// -- Default register blocksizes for vectors --
// NOTE: Register blocksizes for vectors are used when packing
// non-contiguous vectors. Similar to that of KR, they can
// typically be set to 1.
#define BLIS_DEFAULT_VR_S 1
#define BLIS_DEFAULT_VR_D 1
#define BLIS_DEFAULT_VR_C 1
#define BLIS_DEFAULT_VR_Z 1
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- gemm --
#include "bli_gemm_opt_4x4.h"
#define GEMM_UKERNEL gemm_opt_4x4
// -- trsm-related --
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
#define TRSM_L_UKERNEL trsm_l_ref_mxn
#define TRSM_U_UKERNEL trsm_u_ref_mxn
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
#define PACKM_2XK_KERNEL packm_ref_2xk
#define PACKM_4XK_KERNEL packm_ref_4xk
#define PACKM_6XK_KERNEL packm_ref_6xk
#define PACKM_8XK_KERNEL packm_ref_8xk
#define PACKM_10XK_KERNEL packm_ref_10xk
#define PACKM_12XK_KERNEL packm_ref_12xk
#define PACKM_14XK_KERNEL packm_ref_14xk
#define PACKM_16XK_KERNEL packm_ref_16xk
// -- unpackm --
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
#define AXPY2V_KERNEL axpy2v_unb_var1
// -- dotaxpyv --
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
// -- axpyf --
#define AXPYF_KERNEL axpyf_unb_var1
// -- dotxf --
#define DOTXF_KERNEL dotxf_unb_var1
// -- dotxaxpyf --
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
#define ADDV_KERNEL addv_unb_var1
// -- axpyv --
#define AXPYV_KERNEL axpyv_unb_var1
// -- copyv --
#define COPYV_KERNEL copyv_unb_var1
// -- dotv --
#define DOTV_KERNEL dotv_unb_var1
// -- dotxv --
#define DOTXV_KERNEL dotxv_unb_var1
// -- invertv --
#define INVERTV_KERNEL invertv_unb_var1
// -- scal2v --
#define SCAL2V_KERNEL scal2v_unb_var1
// -- scalv --
#define SCALV_KERNEL scalv_unb_var1
// -- setv --
#define SETV_KERNEL setv_unb_var1
// -- subv --
#define SUBV_KERNEL subv_unb_var1
// -- swapv --
#define SWAPV_KERNEL swapv_unb_var1
#endif

1
config/cortex-a9/kernels Symbolic link
View File

@@ -0,0 +1 @@
../../kernels/arm/neon

View File

@@ -0,0 +1,107 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2013, The University of Texas
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Only include this block of code once.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := gcc
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=neon
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
LDFLAGS :=
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif

View File

@@ -0,0 +1,538 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2012, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_sgemm_opt_4x4(
dim_t k,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
float* restrict a_next,
float* restrict b_next
)
{
float32x4_t alphav;
alphav = vmovq_n_f32( *alpha );
float32x4_t av1;
float32x4_t av2;
float32x4_t av3;
float32x4_t av4;
float32x4_t bv1;
float32x4_t bv2;
float32x4_t bv3;
float32x4_t bv4;
dim_t k_iter = k/4;
dim_t k_left = k%4;
dim_t i;
// Vector for column 0
float32x4_t cv0;
// Vector for column 1
float32x4_t cv1;
// Vector for column 2
float32x4_t cv2;
// Vector for column 3
float32x4_t cv3;
if( rs_c == 1 )
{
// Load column 0
cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c );
// Load column 1
cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c );
// Load column 2
cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c );
// Load column 3
cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c );
}
else
{
// Load column 0
cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
// Load column 1
cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
// Load column 2
cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
// Load column 3
cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
}
// Vector for accummulating column 0
float32x4_t abv0;
// Initialize vector to 0.0
abv0 = vmovq_n_f32( 0.0 );
// Vector for accummulating column 1
float32x4_t abv1;
// Initialize vector to 0.0
abv1 = vmovq_n_f32( 0.0 );
// Vector for accummulating column 2
float32x4_t abv2;
// Initialize vector to 0.0
abv2 = vmovq_n_f32( 0.0 );
// Vector for accummulating column 3
float32x4_t abv3;
// Initialize vector to 0.0
abv3 = vmovq_n_f32( 0.0 );
for ( i = 0; i < k_iter; ++i )
{
// Begin iter 0
av1 = vld1q_f32( a );
__builtin_prefetch( a + 224 );
__builtin_prefetch( b + 224 );
bv1 = vld1q_f32( b );
abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 );
abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 );
abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 );
abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 );
av2 = vld1q_f32( a+4 );
//__builtin_prefetch( a + 116 );
//__builtin_prefetch( b + 116 );
bv2 = vld1q_f32( b+4 );
abv0 = vmlaq_lane_f32( abv0, av2, vget_low_f32(bv2), 0 );
abv1 = vmlaq_lane_f32( abv1, av2, vget_low_f32(bv2), 1 );
abv2 = vmlaq_lane_f32( abv2, av2, vget_high_f32(bv2), 0 );
abv3 = vmlaq_lane_f32( abv3, av2, vget_high_f32(bv2), 1 );
av3 = vld1q_f32( a+8 );
//__builtin_prefetch( a + 120 );
//__builtin_prefetch( b + 120 );
bv3 = vld1q_f32( b+8 );
abv0 = vmlaq_lane_f32( abv0, av3, vget_low_f32(bv3), 0 );
abv1 = vmlaq_lane_f32( abv1, av3, vget_low_f32(bv3), 1 );
abv2 = vmlaq_lane_f32( abv2, av3, vget_high_f32(bv3), 0 );
abv3 = vmlaq_lane_f32( abv3, av3, vget_high_f32(bv3), 1 );
av4 = vld1q_f32( a+12);
//__builtin_prefetch( a + 124 );
//__builtin_prefetch( b + 124 );
bv4 = vld1q_f32( b+12);
abv0 = vmlaq_lane_f32( abv0, av4, vget_low_f32(bv4), 0 );
abv1 = vmlaq_lane_f32( abv1, av4, vget_low_f32(bv4), 1 );
abv2 = vmlaq_lane_f32( abv2, av4, vget_high_f32(bv4), 0 );
abv3 = vmlaq_lane_f32( abv3, av4, vget_high_f32(bv4), 1 );
a += 16;
b += 16;
}
for ( i = 0; i < k_left; ++i )
{
av1 = vld1q_f32( a );
__builtin_prefetch( a + 112 );
__builtin_prefetch( b + 112 );
bv1 = vld1q_f32( b );
abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 );
abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 );
abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 );
abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 );
a += 4;
b += 4;
}
__builtin_prefetch( a_next );
__builtin_prefetch( b_next );
cv0 = vmulq_n_f32( cv0, *beta );
cv1 = vmulq_n_f32( cv1, *beta );
cv2 = vmulq_n_f32( cv2, *beta );
cv3 = vmulq_n_f32( cv3, *beta );
cv0 = vmlaq_f32( cv0, abv0, alphav );
cv1 = vmlaq_f32( cv1, abv1, alphav );
cv2 = vmlaq_f32( cv2, abv2, alphav );
cv3 = vmlaq_f32( cv3, abv3, alphav );
if( rs_c == 1 )
{
// Store column 0
vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 );
// Store column 1
vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 );
// Store column 2
vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 );
// Store column 3
vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 );
}
else{
// Store column 0
vst1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
vst1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
vst1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
vst1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
// Store column 1
vst1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
vst1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
vst1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
vst1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
// Store column 2
vst1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
vst1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
vst1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
vst1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
// Store column 3
vst1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
vst1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
vst1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
vst1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
}
}
void bli_dgemm_opt_4x4(
dim_t k,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
double* restrict a_next,
double* restrict b_next
)
{
//dim_t k_iter;
dim_t k_left;
//k_iter = k / 2;
k_left = k % 2;
register double a0;
register double a1;
register double a2;
register double a3;
register double A0;
register double A1;
register double A2;
register double A3;
double b0, b1, b2, b3;
double B0, B1, B2, B3;
double ab00, ab01, ab02, ab03;
double ab10, ab11, ab12, ab13;
double ab20, ab21, ab22, ab23;
double ab30, ab31, ab32, ab33;
double* restrict c00, * restrict c01, * restrict c02, * restrict c03;
double* restrict c10, * restrict c11, * restrict c12, * restrict c13;
double* restrict c20, * restrict c21, * restrict c22, * restrict c23;
double* restrict c30, * restrict c31, * restrict c32, * restrict c33;
double* restrict ap = a;
double* restrict bp = b;
double* restrict Ap = a + 4;
double* restrict Bp = b + 4;
dim_t i;
c00 = (c + 0*rs_c + 0*cs_c);
c10 = (c + 1*rs_c + 0*cs_c);
c20 = (c + 2*rs_c + 0*cs_c);
c30 = (c + 3*rs_c + 0*cs_c);
c01 = (c + 0*rs_c + 1*cs_c);
c11 = (c + 1*rs_c + 1*cs_c);
c21 = (c + 2*rs_c + 1*cs_c);
c31 = (c + 3*rs_c + 1*cs_c);
c02 = (c + 0*rs_c + 2*cs_c);
c12 = (c + 1*rs_c + 2*cs_c);
c22 = (c + 2*rs_c + 2*cs_c);
c32 = (c + 3*rs_c + 2*cs_c);
c03 = (c + 0*rs_c + 3*cs_c);
c13 = (c + 1*rs_c + 3*cs_c);
c23 = (c + 2*rs_c + 3*cs_c);
c33 = (c + 3*rs_c + 3*cs_c);
ab00 = 0.0; ab10 = 0.0; ab20 = 0.0; ab30 = 0.0;
ab01 = 0.0; ab11 = 0.0; ab21 = 0.0; ab31 = 0.0;
ab02 = 0.0; ab12 = 0.0; ab22 = 0.0; ab32 = 0.0;
ab03 = 0.0; ab13 = 0.0; ab23 = 0.0; ab33 = 0.0;
A0 = *(Ap + 0);
A1 = *(Ap + 1);
A2 = *(Ap + 2);
A3 = *(Ap + 3);
a0 = *(ap + 0);
a1 = *(ap + 1);
a2 = *(ap + 2);
B0 = *(Bp + 0);
B1 = *(Bp + 1);
B2 = *(Bp + 2);
B3 = *(Bp + 3);
b0 = *(bp + 0);
b1 = *(bp + 1);
b2 = *(bp + 2);
double *Aplast = (Ap + 4*k);
//for ( i = 0; i < k_iter; ++i ) // Unroll by factor 4.
for ( ; Ap != Aplast ; ) // Unroll by factor 4.
{
/* Prefetch */
//__asm__ ("pld\t[%0],#100\n\t" : :"r"(Ap) : );
__builtin_prefetch( ap + 112 );
__builtin_prefetch( Ap + 112 );
__builtin_prefetch( bp + 112 );
__builtin_prefetch( Bp + 112 );
// Iteration 0.
ab00 += A0 * B0;
a3 = *(ap + 3);
ab10 += A1 * B0;
b3 = *(bp + 3);
ab20 += A2 * B0;
ab30 += A3 * B0;
ab01 += A0 * B1;
ab11 += A1 * B1;
B0 = *(Bp + 8); // Prefetch.
ab21 += A2 * B1;
ab31 += A3 * B1;
ab02 += A0 * B2;
B1 = *(Bp + 9);
ab12 += A1 * B2;
ab22 += A2 * B2;
ab32 += A3 * B2;
B2 = *(Bp + 10);
ab03 += A0 * B3;
A0 = *(Ap + 8); // Prefetch.
ab13 += A1 * B3;
A1 = *(Ap + 9); // Prefetch.
ab23 += A2 * B3;
ab33 += A3 * B3;
A2 = *(Ap + 10); // Prefetch.
// Iteration 1.
//__asm__ ("pld\t[%0],#200\n\t" : :"r"(Ap) : );
ab00 += a0 * b0;
ab10 += a1 * b0;
A3 = *(Ap + 11); // Prefetch.
ab20 += a2 * b0;
ab30 += a3 * b0;
B3 = *(Bp + 11);
ab01 += a0 * b1;
b0 = *(bp + 8);
ab11 += a1 * b1;
ab21 += a2 * b1;
ab31 += a3 * b1;
b1 = *(bp + 9);
ab02 += a0 * b2;
ab12 += a1 * b2;
ab22 += a2 * b2;
ab32 += a3 * b2;
b2 = *(bp + 10);
ab03 += a0 * b3;
a0 = *(ap + 8);
ab13 += a1 * b3;
a1 = *(ap + 9);
ab23 += a2 * b3;
a2 = *(ap + 10);
ab33 += a3 * b3;
//a3 = *(ap + 11);
ap += 8;
Ap += 8;
bp += 8;
Bp += 8;
}
for ( i = 0; i < k_left; ++i )
{
a0 = *(a + 0);
a1 = *(a + 1);
a2 = *(a + 2);
a3 = *(a + 3);
b0 = *(b + 0);
b1 = *(b + 1);
b2 = *(b + 2);
b3 = *(b + 3);
ab00 += a0 * b0;
ab10 += a1 * b0;
ab20 += a2 * b0;
ab30 += a3 * b0;
ab01 += a0 * b1;
ab11 += a1 * b1;
ab21 += a2 * b1;
ab31 += a3 * b1;
ab02 += a0 * b2;
ab12 += a1 * b2;
ab22 += a2 * b2;
ab32 += a3 * b2;
ab03 += a0 * b3;
ab13 += a1 * b3;
ab23 += a2 * b3;
ab33 += a3 * b3;
a += 4;
b += 2;
}
*c00 = *c00 * *beta;
*c10 = *c10 * *beta;
*c20 = *c20 * *beta;
*c30 = *c30 * *beta;
*c01 = *c01 * *beta;
*c11 = *c11 * *beta;
*c21 = *c21 * *beta;
*c31 = *c31 * *beta;
*c02 = *c02 * *beta;
*c12 = *c12 * *beta;
*c22 = *c22 * *beta;
*c32 = *c32 * *beta;
*c03 = *c03 * *beta;
*c13 = *c13 * *beta;
*c23 = *c23 * *beta;
*c33 = *c33 * *beta;
*c00 += ab00 * *alpha;
*c10 += ab10 * *alpha;
*c20 += ab20 * *alpha;
*c30 += ab30 * *alpha;
*c01 += ab01 * *alpha;
*c11 += ab11 * *alpha;
*c21 += ab21 * *alpha;
*c31 += ab31 * *alpha;
*c02 += ab02 * *alpha;
*c12 += ab12 * *alpha;
*c22 += ab22 * *alpha;
*c32 += ab32 * *alpha;
*c03 += ab03 * *alpha;
*c13 += ab13 * *alpha;
*c23 += ab23 * *alpha;
*c33 += ab33 * *alpha;
}
void bli_cgemm_opt_4x4(
dim_t k,
scomplex* restrict alpha,
scomplex* restrict a,
scomplex* restrict b,
scomplex* restrict beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
scomplex* restrict a_next,
scomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
void bli_zgemm_opt_4x4(
dim_t k,
dcomplex* restrict alpha,
dcomplex* restrict a,
dcomplex* restrict b,
dcomplex* restrict beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
dcomplex* restrict a_next,
dcomplex* restrict b_next
)
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}

View File

@@ -0,0 +1,52 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2013, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "arm_neon.h"
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict a_next, \
ctype* restrict b_next \
);
INSERT_GENTPROT_BASIC( gemm_opt_4x4 )