Merge pull request #3 from figual/master

New ARM armv7a kernels and Assembly file consideration in Makefile
2026-05-11 09:39:59 +00:00 · 2014-02-21 09:04:13 -06:00
parent b29e1c2b27 d1813c9dee
commit fc04b5eb69
10 changed files with 2819 additions and 7 deletions
--- a/20
+++ b/20
@@ -317,18 +317,24 @@ CFLAGS_KERNELS := $(CFLAGS_KERNELS) $(VERS_DEF)
 # Convert source file paths to object file paths by replacing the base source
 # directories with the base object directories, and also replacing the source
 # file suffix (eg: '.c') with '.o'.
-MK_BLIS_CONFIG_OBJS         := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
+MK_BLIS_FRAME_OBJS         := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
                                          $(filter %.c, $(MK_FRAME_SRC)))
-MK_BLIS_CONFIG_NOOPT_OBJS   := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
+MK_BLIS_FRAME_NOOPT_OBJS   := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
                                          $(filter %.c, $(MK_FRAME_NOOPT_SRC)))
-MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
+MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
                                          $(filter %.c, $(MK_FRAME_KERNELS_SRC)))

-MK_BLIS_FRAME_OBJS          := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
+MK_BLIS_CONFIG_OBJS          := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
+                                          $(filter %.S, $(MK_CONFIG_SRC)))
+MK_BLIS_CONFIG_OBJS          += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
                                          $(filter %.c, $(MK_CONFIG_SRC)))
-MK_BLIS_FRAME_NOOPT_OBJS    := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
+MK_BLIS_CONFIG_NOOPT_OBJS    := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
+                                          $(filter %.S, $(MK_CONFIG_NOOPT_SRC)))
+MK_BLIS_CONFIG_NOOPT_OBJS    += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
                                          $(filter %.c, $(MK_CONFIG_NOOPT_SRC)))
-MK_BLIS_FRAME_KERNELS_OBJS  := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
+MK_BLIS_CONFIG_KERNELS_OBJS  := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
+                                          $(filter %.S, $(MK_CONFIG_KERNELS_SRC)))
+MK_BLIS_CONFIG_KERNELS_OBJS  += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
                                          $(filter %.c, $(MK_CONFIG_KERNELS_SRC)))

 # Combine all of the object files into some readily-accessible variables.
@@ -427,7 +433,7 @@ else
 	@$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@
 endif

-$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.c $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH)
+$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.[cS] $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH)
 ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
 	$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@
 else
--- a/config/armv7a/bli_config.h
+++ b/config/armv7a/bli_config.h
@@ -0,0 +1,169 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_CONFIG_H
+#define BLIS_CONFIG_H
+
+
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
+
+
+// -- INTEGER PROPERTIES -------------------------------------------------------
+
+// The bit size of the integer type used to track values such as dimensions,
+// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
+// integers while 64 results in 64-bit integers. Any other value results in use
+// of the C99 type "long int". Note that this ONLY affects integers used
+// internally within BLIS as well as those exposed in the native BLAS-like BLIS
+// interface.
+#define BLIS_INT_TYPE_SIZE               32
+
+
+
+// -- FLOATING-POINT PROPERTIES ------------------------------------------------
+
+// Define the number of floating-point types supported, and the size of the
+// largest type.
+#define BLIS_NUM_FP_TYPES                4
+#define BLIS_MAX_TYPE_SIZE               sizeof(dcomplex)
+
+// Enable use of built-in C99 "float complex" and "double complex" types and
+// associated overloaded operations and functions? Disabling results in
+// scomplex and dcomplex being defined in terms of simple structs.
+//#define BLIS_ENABLE_C99_COMPLEX
+
+
+
+// -- MULTITHREADING -----------------------------------------------------------
+
+// The maximum number of BLIS threads that will run concurrently.
+#define BLIS_MAX_NUM_THREADS             1
+
+
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+// -- Contiguous (static) memory allocator --
+
+// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
+// contiguous memory pools.
+#define BLIS_NUM_MC_X_KC_BLOCKS          BLIS_MAX_NUM_THREADS
+#define BLIS_NUM_KC_X_NC_BLOCKS          BLIS_MAX_NUM_THREADS
+#define BLIS_NUM_MC_X_NC_BLOCKS          0
+
+// The maximum preload byte offset is used to pad the end of the contiguous
+// memory pools so that the micro-kernel, when computing with the end of the
+// last block, can exceed the bounds of the usable portion of the memory
+// region without causing a segmentation fault.
+#define BLIS_MAX_PRELOAD_BYTE_OFFSET     128
+
+// -- Memory alignment --
+
+// It is sometimes useful to define the various memory alignments in terms
+// of some other characteristics of the system, such as the cache line size
+// and the page size.
+#define BLIS_CACHE_LINE_SIZE             32
+#define BLIS_PAGE_SIZE                   4096
+
+// Alignment size needed by the instruction set for aligned SIMD/vector
+// instructions.
+#define BLIS_SIMD_ALIGN_SIZE             32
+
+// Alignment size used to align local stack buffers within macro-kernel
+// functions.
+#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when allocating memory dynamically from the operating
+// system (eg: posix_memalign()). To disable heap alignment and just use
+// malloc() instead, set this to 1.
+#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
+
+// Alignment size used when sizing leading dimensions of dynamically
+// allocated memory.
+#define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_CACHE_LINE_SIZE
+
+// Alignment size used when allocating entire blocks of contiguous memory
+// from the contiguous memory allocator.
+#define BLIS_CONTIG_ADDR_ALIGN_SIZE      BLIS_PAGE_SIZE
+
+// Alignment size used when sizing strides (eg: of packed micro-panels)
+// within a block of contiguous memory.
+#define BLIS_CONTIG_STRIDE_ALIGN_SIZE    BLIS_SIMD_ALIGN_SIZE
+
+
+
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Basic (homogeneous) datatype support always enabled.
+
+// Enable mixed domain operations?
+//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
+
+// Enable extra mixed precision operations?
+//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
+
+
+
+// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
+
+// Stay initialized after auto-initialization, unless and until the user
+// explicitly calls bli_finalize().
+#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
+
+
+
+// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
+
+// Enable the BLAS compatibility layer?
+#define BLIS_ENABLE_BLAS2BLIS
+
+// The bit size of the integer type used to track values such as dimensions and
+// leading dimensions (ie: column strides) within the BLAS compatibility layer.
+// A value of 32 results in the compatibility layer using 32-bit signed integers
+// while 64 results in 64-bit integers. Any other value results in use of the
+// C99 type "long int". Note that this ONLY affects integers used within the
+// BLAS compatibility layer.
+#define BLIS_BLAS2BLIS_INT_TYPE_SIZE     32
+
+// Fortran-77 name-mangling macros.
+#define PASTEF770(name)                        name ## _
+#define PASTEF77(ch1,name)       ch1        ## name ## _
+#define PASTEF772(ch1,ch2,name)  ch1 ## ch2 ## name ## _
+
+
+
+
+#endif
+
--- a/config/armv7a/bli_kernel.h
+++ b/config/armv7a/bli_kernel.h
@@ -0,0 +1,348 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_KERNEL_H
+#define BLIS_KERNEL_H
+
+
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Default cache blocksizes --
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes)
+//     (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of
+//     (a) NR (for zero-padding purposes)
+//     (b) MR (for zero-padding purposes when MR and NR are "swapped")
+// (3) KC must be a multiple of
+//     (a) MR and
+//     (b) NR (for triangular operations such as trmm and trsm).
+//
+
+#define BLIS_DEFAULT_MC_S              432
+#define BLIS_DEFAULT_KC_S              352
+#define BLIS_DEFAULT_NC_S              4096
+
+#define BLIS_DEFAULT_MC_D              192
+#define BLIS_DEFAULT_KC_D              256
+#define BLIS_DEFAULT_NC_D              4096
+
+#define BLIS_DEFAULT_MC_C              64
+#define BLIS_DEFAULT_KC_C              128
+#define BLIS_DEFAULT_NC_C              4096
+
+#define BLIS_DEFAULT_MC_Z              64
+#define BLIS_DEFAULT_KC_Z              128
+#define BLIS_DEFAULT_NC_Z              4096
+
+// -- Cache blocksize extensions (for optimizing edge cases) --
+
+// NOTE: These cache blocksize "extensions" have the same constraints as
+// the corresponding default blocksizes above. When these values are
+// non-zero, blocksizes used at edge cases are extended (enlarged) if
+// such an extension would encompass the remaining portion of the
+// matrix dimension.
+
+#define BLIS_EXTEND_MC_S               0 //(BLIS_DEFAULT_MC_S/4)
+#define BLIS_EXTEND_KC_S               0 //(BLIS_DEFAULT_KC_S/4)
+#define BLIS_EXTEND_NC_S               0 //(BLIS_DEFAULT_NC_S/4)
+
+#define BLIS_EXTEND_MC_D               0 //(BLIS_DEFAULT_MC_D/4)
+#define BLIS_EXTEND_KC_D               0 //(BLIS_DEFAULT_KC_D/4)
+#define BLIS_EXTEND_NC_D               0 //(BLIS_DEFAULT_NC_D/4)
+
+#define BLIS_EXTEND_MC_C               0 //(BLIS_DEFAULT_MC_C/4)
+#define BLIS_EXTEND_KC_C               0 //(BLIS_DEFAULT_KC_C/4)
+#define BLIS_EXTEND_NC_C               0 //(BLIS_DEFAULT_NC_C/4)
+
+#define BLIS_EXTEND_MC_Z               0 //(BLIS_DEFAULT_MC_Z/4)
+#define BLIS_EXTEND_KC_Z               0 //(BLIS_DEFAULT_KC_Z/4)
+#define BLIS_EXTEND_NC_Z               0 //(BLIS_DEFAULT_NC_Z/4)
+
+// -- Default register blocksizes for micro-kernel --
+
+// NOTE: When using the reference configuration, these register blocksizes
+// in the m and n dimensions should all be equal to the size expected by
+// the reference micro-kernel(s).
+
+#define BLIS_DEFAULT_MR_S              4
+#define BLIS_DEFAULT_NR_S              4
+
+#define BLIS_DEFAULT_MR_D              4
+#define BLIS_DEFAULT_NR_D              4
+
+#define BLIS_DEFAULT_MR_C              2
+#define BLIS_DEFAULT_NR_C              2
+
+#define BLIS_DEFAULT_MR_Z              2
+#define BLIS_DEFAULT_NR_Z              2
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (ie: when k % f > 0) then these
+// register blocksizes in the k dimension can be defined to 1.
+
+#define BLIS_DEFAULT_KR_S              1
+#define BLIS_DEFAULT_KR_D              1
+#define BLIS_DEFAULT_KR_C              1
+#define BLIS_DEFAULT_KR_Z              1
+
+// -- Register blocksize extensions (for packed micro-panels) --
+
+// NOTE: These register blocksize "extensions" determine whether the
+// leading dimensions used within the packed micro-panels are equal to
+// or greater than their corresponding register blocksizes above.
+
+#define BLIS_EXTEND_MR_S               0
+#define BLIS_EXTEND_NR_S               0
+
+#define BLIS_EXTEND_MR_D               0
+#define BLIS_EXTEND_NR_D               0
+
+#define BLIS_EXTEND_MR_C               0
+#define BLIS_EXTEND_NR_C               0
+
+#define BLIS_EXTEND_MR_Z               0
+#define BLIS_EXTEND_NR_Z               0
+
+// Register blocksize extensions in the k dimension are not used.
+
+#define BLIS_EXTEND_KR_S               0
+#define BLIS_EXTEND_KR_D               0
+#define BLIS_EXTEND_KR_C               0
+#define BLIS_EXTEND_KR_Z               0
+
+// -- Default incremental packing blocksizes (n dimension) --
+
+// NOTE: These incremental packing blocksizes (for the n dimension) are only
+// used by certain blocked variants. But when the *are* used, they MUST be
+// be an integer multiple of NR!
+
+#define BLIS_DEFAULT_NI_FAC            16
+#define BLIS_DEFAULT_NI_S              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
+#define BLIS_DEFAULT_NI_D              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
+#define BLIS_DEFAULT_NI_C              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
+#define BLIS_DEFAULT_NI_Z              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
+
+
+
+// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
+
+// NOTE: These values determine high-level cache blocking for level-2
+// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
+// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
+// gemv subproblems are called. The blocked algorithms are only useful in
+// that they provide the opportunity for packing vectors. (Matrices can also
+// be packed here, but this tends to be much too expensive in practice to
+// actually employ.)
+
+#define BLIS_DEFAULT_L2_MC_S           1000
+#define BLIS_DEFAULT_L2_NC_S           1000
+
+#define BLIS_DEFAULT_L2_MC_D           1000
+#define BLIS_DEFAULT_L2_NC_D           1000
+
+#define BLIS_DEFAULT_L2_MC_C           1000
+#define BLIS_DEFAULT_L2_NC_C           1000
+
+#define BLIS_DEFAULT_L2_MC_Z           1000
+#define BLIS_DEFAULT_L2_NC_Z           1000
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+// -- Default fusing factors for level-1f operations --
+
+// NOTE: Default fusing factors are not used by the reference implementations
+// of level-1f operations. They are here only for use when these operations
+// are optimized.
+
+#define BLIS_DEFAULT_FUSE_FAC_S        8
+#define BLIS_DEFAULT_FUSE_FAC_D        4
+#define BLIS_DEFAULT_FUSE_FAC_C        4
+#define BLIS_DEFAULT_FUSE_FAC_Z        2
+
+#define BLIS_AXPYF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_AXPYF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_AXPYF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_AXPYF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXF_FUSE_FAC_S          BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXF_FUSE_FAC_D          BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXF_FUSE_FAC_C          BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXF_FUSE_FAC_Z          BLIS_DEFAULT_FUSE_FAC_Z
+
+#define BLIS_DOTXAXPYF_FUSE_FAC_S      BLIS_DEFAULT_FUSE_FAC_S
+#define BLIS_DOTXAXPYF_FUSE_FAC_D      BLIS_DEFAULT_FUSE_FAC_D
+#define BLIS_DOTXAXPYF_FUSE_FAC_C      BLIS_DEFAULT_FUSE_FAC_C
+#define BLIS_DOTXAXPYF_FUSE_FAC_Z      BLIS_DEFAULT_FUSE_FAC_Z
+
+
+
+// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
+
+// -- Default register blocksizes for vectors --
+
+// NOTE: Register blocksizes for vectors are used when packing
+// non-contiguous vectors. Similar to that of KR, they can
+// typically be set to 1.
+
+#define BLIS_DEFAULT_VR_S              1
+#define BLIS_DEFAULT_VR_D              1
+#define BLIS_DEFAULT_VR_C              1
+#define BLIS_DEFAULT_VR_Z              1
+
+
+
+// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
+
+// -- gemm --
+
+#include "bli_gemm_opt_4x4.h"
+#define GEMM_UKERNEL         gemm_opt_4x4
+
+// -- trsm-related --
+
+#define GEMMTRSM_L_UKERNEL   gemmtrsm_l_ref_mxn
+#define GEMMTRSM_U_UKERNEL   gemmtrsm_u_ref_mxn
+
+#define TRSM_L_UKERNEL       trsm_l_ref_mxn
+#define TRSM_U_UKERNEL       trsm_u_ref_mxn
+
+
+
+// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
+
+// -- packm --
+
+#define PACKM_2XK_KERNEL     packm_ref_2xk
+#define PACKM_4XK_KERNEL     packm_ref_4xk
+#define PACKM_6XK_KERNEL     packm_ref_6xk
+#define PACKM_8XK_KERNEL     packm_ref_8xk
+#define PACKM_10XK_KERNEL    packm_ref_10xk
+#define PACKM_12XK_KERNEL    packm_ref_12xk
+#define PACKM_14XK_KERNEL    packm_ref_14xk
+#define PACKM_16XK_KERNEL    packm_ref_16xk
+
+// -- unpackm --
+
+#define UNPACKM_2XK_KERNEL   unpackm_ref_2xk
+#define UNPACKM_4XK_KERNEL   unpackm_ref_4xk
+#define UNPACKM_6XK_KERNEL   unpackm_ref_6xk
+#define UNPACKM_8XK_KERNEL   unpackm_ref_8xk
+#define UNPACKM_10XK_KERNEL  unpackm_ref_10xk
+#define UNPACKM_12XK_KERNEL  unpackm_ref_12xk
+#define UNPACKM_14XK_KERNEL  unpackm_ref_14xk
+#define UNPACKM_16XK_KERNEL  unpackm_ref_16xk
+
+
+
+// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
+
+// -- axpy2v --
+
+#define AXPY2V_KERNEL        axpy2v_unb_var1
+
+// -- dotaxpyv --
+
+#define DOTAXPYV_KERNEL      dotaxpyv_unb_var1
+
+// -- axpyf --
+
+#define AXPYF_KERNEL         axpyf_unb_var1
+
+// -- dotxf --
+
+#define DOTXF_KERNEL         dotxf_unb_var1
+
+// -- dotxaxpyf --
+
+#define DOTXAXPYF_KERNEL     dotxaxpyf_unb_var1
+
+
+
+// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
+
+// -- addv --
+
+#define ADDV_KERNEL          addv_unb_var1
+
+// -- axpyv --
+
+#define AXPYV_KERNEL         axpyv_unb_var1
+
+// -- copyv --
+
+#define COPYV_KERNEL         copyv_unb_var1
+
+// -- dotv --
+
+#define DOTV_KERNEL          dotv_unb_var1
+
+// -- dotxv --
+
+#define DOTXV_KERNEL         dotxv_unb_var1
+
+// -- invertv --
+
+#define INVERTV_KERNEL       invertv_unb_var1
+
+// -- scal2v --
+
+#define SCAL2V_KERNEL        scal2v_unb_var1
+
+// -- scalv --
+
+#define SCALV_KERNEL         scalv_unb_var1
+
+// -- setv --
+
+#define SETV_KERNEL          setv_unb_var1
+
+// -- subv --
+
+#define SUBV_KERNEL          subv_unb_var1
+
+// -- swapv --
+
+#define SWAPV_KERNEL         swapv_unb_var1
+
+
+
+#endif
+
--- a/config/armv7a/kernels/3/bli_cgemm_kernel_2x2.S
+++ b/config/armv7a/kernels/3/bli_cgemm_kernel_2x2.S
@@ -0,0 +1,502 @@
+
+#define REALNAME bli_cgemm_kernel_2x2
+
+#define STACKSIZE 256
+
+#define	K		r0
+#define	PTR_ALPHA	r1
+#define	OLD_A		r2
+#define	OLD_B		r3
+#define PTR_BETA	[fp, #0 ]
+#define OLD_C		[fp, #4 ]
+#define OLD_RSC		[fp, #8 ]
+#define OLD_CSC		[fp, #12 ]
+#define AUX		[fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* register
+*******************************************************/
+
+#define L	r2
+
+#define	AO	r5
+#define	BO	r6
+
+#define	CO1	r7
+#define	CO2	r8
+
+
+#define A_PRE	96
+#define B_PRE	96
+#define C_PRE	0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#define FMAC_BR	fnmacs
+#define FMAC_BI	fmacs
+
+#define NN 1
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) 
+
+	#define	FADD_R	fsubs
+	#define	FADD_I	fadds
+
+	#define	FMAC_R1	fnmacs
+	#define	FMAC_R2	fnmacs
+	#define	FMAC_I1	fmacs
+	#define	FMAC_I2	fnmacs
+
+#elif defined(CN) || defined(CT)
+
+	#define	FADD_R	fadds
+	#define	FADD_I	fsubs
+
+	#define	FMAC_R1	fmacs
+	#define	FMAC_R2	fmacs
+	#define	FMAC_I1	fnmacs
+	#define	FMAC_I2	fmacs
+
+#elif defined(NC) || defined(TC)
+
+	#define	FADD_R	fadds
+	#define	FADD_I	fsubs
+
+	#define	FMAC_R1	fmacs
+	#define	FMAC_R2	fnmacs
+	#define	FMAC_I1	fmacs
+	#define	FMAC_I2	fmacs
+
+#else
+
+	#define	FADD_R  fsubs
+	#define	FADD_I	fadds
+
+	#define	FMAC_R1	fnmacs
+	#define	FMAC_R2	fmacs
+	#define	FMAC_I1	fnmacs
+	#define	FMAC_I2	fnmacs
+
+#endif
+
+
+
+.macro INIT2x2
+
+	vsub.f32		s16 , s16 , s16
+	vmov.f32		s17, s16
+	vmov.f32		s18, s16
+	vmov.f32		s19, s16
+	vmov.f32		s20, s16
+	vmov.f32		s21, s16
+	vmov.f32		s22, s16
+	vmov.f32		s23, s16
+	vmov.f32		s24, s16
+	vmov.f32		s25, s16
+	vmov.f32		s26, s16
+	vmov.f32		s27, s16
+	vmov.f32		s28, s16
+	vmov.f32		s29, s16
+	vmov.f32		s30, s16
+	vmov.f32		s31, s16
+
+.endm
+
+.macro KERNEL2x2_I
+	pld	[ AO , #A_PRE ]
+	pld	[ BO , #B_PRE ]
+	flds	s0 , [ AO ]
+	flds	s1 , [ AO, #4 ]
+	flds	s8 , [ BO ]
+	flds	s9 , [ BO, #4 ]
+
+	fmuls	s16  , s0,  s8
+	flds	s2 , [ AO, #8 ]
+	fmuls	s24  , s1,  s9
+	flds	s3 , [ AO, #12 ]
+	fmuls	s17  , s0,  s9
+	flds	s10, [ BO, #8 ]
+	fmuls	s25  , s1,  s8
+
+	flds	s11, [ BO, #12 ]
+	fmuls	s18  , s2,  s8
+	add	BO , BO, #16
+	fmuls	s26  , s3,  s9
+	add	AO , AO, #16
+	fmuls	s19  , s2,  s9
+	pld	[ BO , #B_PRE ]
+	fmuls	s27  , s3,  s8
+
+	pld	[ AO , #A_PRE ]
+	fmuls	s20  , s0,  s10
+	flds	s4 , [ AO, #0 ]
+	fmuls	s28  , s1,  s11
+	flds	s5 , [ AO, #4 ]
+	fmuls	s21  , s0,  s11
+	flds	s12, [ BO ]
+	fmuls	s29  , s1,  s10
+
+	flds	s13, [ BO, #4 ]
+	fmuls	s22  , s2,  s10
+	flds	s6 , [ AO, #8 ]
+	fmuls	s30  , s3,  s11
+	flds	s7 , [ AO, #12 ]
+	fmuls	s23  , s2,  s11
+	flds	s14, [ BO, #8 ]
+	fmuls	s31  , s3,  s10
+	flds	s15, [ BO, #12 ]
+
+	add	BO , BO, #16
+	add	AO , AO, #16
+.endm
+
+
+
+.macro KERNEL2x2_M1
+	pld	[ AO , #A_PRE ]
+
+	fmacs	s16  , s0,  s8
+	pld	[ BO , #B_PRE ]
+	fmacs	s24  , s1,  s9
+	flds	s4 , [ AO, #0 ]
+	fmacs	s17  , s0,  s9
+	flds	s5 , [ AO, #4 ]
+	fmacs	s25  , s1,  s8
+
+	flds	s12, [ BO ]
+	fmacs	s18  , s2,  s8
+	flds	s13, [ BO, #4 ]
+	fmacs	s26  , s3,  s9
+	flds	s6 , [ AO, #8 ]
+	fmacs	s19  , s2,  s9
+	flds	s7 , [ AO, #12 ]
+	fmacs	s27  , s3,  s8
+
+	fmacs	s20  , s0,  s10
+	flds	s14, [ BO, #8 ]
+	fmacs	s28  , s1,  s11
+	fmacs	s21  , s0,  s11
+	flds	s15, [ BO, #12 ]
+	fmacs	s29  , s1,  s10
+
+	fmacs	s22  , s2,  s10
+	add	BO , BO, #16
+	fmacs	s30  , s3,  s11
+	fmacs	s23  , s2,  s11
+	add	AO , AO, #16
+	fmacs	s31  , s3,  s10
+
+.endm
+
+.macro KERNEL2x2_M2
+
+	fmacs	s16  , s4,  s12
+	fmacs	s24  , s5,  s13
+	flds	s0 , [ AO, #0 ]
+	fmacs	s17  , s4,  s13
+	flds	s1 , [ AO, #4 ]
+	fmacs	s25  , s5,  s12
+
+	fmacs	s18  , s6,  s12
+	flds	s8 , [ BO ]
+	fmacs	s26  , s7,  s13
+	flds	s9 , [ BO, #4 ]
+	fmacs	s19  , s6,  s13
+	fmacs	s27  , s7,  s12
+
+	flds	s2 , [ AO, #8 ]
+	fmacs	s20  , s4,  s14
+	flds	s3 , [ AO, #12 ]
+	fmacs	s28  , s5,  s15
+	fmacs	s21  , s4,  s15
+	flds	s10, [ BO, #8 ]
+	fmacs	s29  , s5,  s14
+
+	flds	s11, [ BO, #12 ]
+	fmacs	s22  , s6,  s14
+	fmacs	s30  , s7,  s15
+	add	BO , BO, #16
+	fmacs	s23  , s6,  s15
+	add	AO , AO, #16
+	fmacs	s31  , s7,  s14
+
+.endm
+
+
+.macro KERNEL2x2_E
+
+	fmacs	s16  , s4,  s12
+	fmacs	s24  , s5,  s13
+	fmacs	s17  , s4,  s13
+	fmacs	s25  , s5,  s12
+
+	fmacs	s18  , s6,  s12
+	fmacs	s26  , s7,  s13
+	fmacs	s19  , s6,  s13
+	fmacs	s27  , s7,  s12
+
+	fmacs	s20  , s4,  s14
+	fmacs	s28  , s5,  s15
+	fmacs	s21  , s4,  s15
+	fmacs	s29  , s5,  s14
+
+	fmacs	s22  , s6,  s14
+	fmacs	s30  , s7,  s15
+	fmacs	s23  , s6,  s15
+	fmacs	s31  , s7,  s14
+
+.endm
+
+.macro KERNEL2x2_SUB
+
+	flds	s0 , [ AO ]
+	flds	s1 , [ AO, #4 ]
+	flds	s8 , [ BO ]
+	flds	s9 , [ BO, #4 ]
+
+	fmacs	s16  , s0,  s8
+	flds	s2 , [ AO, #8 ]
+	fmacs	s24  , s1,  s9
+	flds	s3 , [ AO, #12 ]
+	fmacs	s17  , s0,  s9
+	flds	s10, [ BO, #8 ]
+	fmacs	s25  , s1,  s8
+
+	flds	s11, [ BO, #12 ]
+	fmacs	s18  , s2,  s8
+	fmacs	s26  , s3,  s9
+	fmacs	s19  , s2,  s9
+	fmacs	s27  , s3,  s8
+
+	fmacs	s20  , s0,  s10
+	fmacs	s28  , s1,  s11
+	fmacs	s21  , s0,  s11
+	fmacs	s29  , s1,  s10
+
+	fmacs	s22  , s2,  s10
+	add	BO , BO, #16
+	fmacs	s30  , s3,  s11
+	fmacs	s23  , s2,  s11
+	add	AO , AO, #16
+	fmacs	s31  , s3,  s10
+
+.endm
+
+
+
+
+.macro SAVE2x2
+
+        ldr     r3, OLD_RSC                             // Row stride size
+        lsl     r3, r3, #3                              // multiply with size of complex float
+
+        flds    s0, [ PTR_ALPHA ]                       // load real part of alpha
+        flds    s1, [ PTR_ALPHA, #4 ]                   // load imag part of alpha
+        ldr     r4, PTR_BETA
+        flds    s2, [ r4 ]                              // load real part of beta
+        flds    s3, [ r4, #4  ]                         // load imag part of beta
+
+	// Add/Sub the real and the imag parts
+	FADD_R	s16, s24 , s16
+	FADD_I  s17, s25 , s17
+	FADD_R	s18, s26 , s18
+	FADD_I  s19, s27 , s19
+	FADD_R	s20, s28 , s20
+	FADD_I  s21, s29 , s21
+	FADD_R	s22, s30 , s22
+	FADD_I  s23, s31 , s23
+
+	mov	r4, CO1					// save pointer
+	fldmias CO1, { s4 - s5 }			// read real and imag part from C
+	add	CO1, CO1, r3
+	
+	mov	r2, CO2					// save pointer
+	fldmias CO2, { s8 - s9 }			// read real and imag part from C
+	add	CO2, CO2, r3
+
+	fmuls	s24, s4, s2				// multiply Beta-real with C-real
+	fmuls	s25, s5, s2				// multiply Beta-real with C-imag
+	fmuls	s28, s8, s2				// multiply Beta-real with C-real
+	fmuls	s29, s9, s2				// multiply Beta-real with C-imag
+
+	FMAC_BR	s24, s3, s5				// multiply beta-imag with C-imag and add
+	FMAC_BI	s25, s3, s4				// multiply beta-imag with C-real and add
+	FMAC_BR	s28, s3, s9				// multiply beta-imag with C-imag and add
+	FMAC_BI	s29, s3, s8				// multiply beta-imag with C-real and add
+
+	FMAC_R1 s24 , s0 , s16
+	FMAC_I1 s25 , s0 , s17
+	FMAC_R2 s24 , s1 , s17
+	FMAC_I2	s25 , s1 , s16
+
+	FMAC_R1 s28 , s0 , s20
+	FMAC_I1 s29 , s0 , s21
+	FMAC_R2 s28 , s1 , s21
+	FMAC_I2	s29 , s1 , s20
+
+	fldmias CO1, { s4 - s5 }			// read real and imag part from C
+	fldmias CO2, { s8 - s9 }			// read real and imag part from C
+
+	fmuls	s26, s4, s2				// multiply Beta-real with C-real
+	fmuls	s27, s5, s2				// multiply Beta-real with C-imag
+	fmuls	s30, s8, s2				// multiply Beta-real with C-real
+	fmuls	s31, s9, s2				// multiply Beta-real with C-imag
+
+	FMAC_BR	s26, s3, s5				// multiply beta-imag with C-imag and add
+	FMAC_BI	s27, s3, s4				// multiply beta-imag with C-real and add
+	FMAC_BR	s30, s3, s9				// multiply beta-imag with C-imag and add
+	FMAC_BI	s31, s3, s8				// multiply beta-imag with C-real and add
+
+	FMAC_R1 s26 , s0 , s18
+	FMAC_I1 s27 , s0 , s19
+	FMAC_R2 s26 , s1 , s19
+	FMAC_I2	s27 , s1 , s18
+
+	FMAC_R1 s30, s0 , s22
+	FMAC_I1 s31, s0 , s23
+	FMAC_R2 s30, s1 , s23
+	FMAC_I2	s31, s1 , s22
+
+	mov	CO1, r4					// restore pointer
+	mov	CO2, r2					// restore pointer
+	fstmias CO1, { s24 - s25 }
+	fstmias CO2, { s28 - s29 }
+	add	CO1, CO1, r3
+	add	CO2, CO2, r3
+	fstmias CO1, { s26 - s27 }
+	fstmias CO2, { s30 - s31 }
+
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        .arm             	
+        .global REALNAME 	
+        .func   REALNAME 	
+
+REALNAME:
+
+	push	{r4 - r9, fp}					// save register
+	add	fp, sp, #28					// add number of saved register multiplied by size of int
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	mov	AO, OLD_A					// pointer matrix A
+	mov	BO, OLD_B					// pointer matrix B
+
+	sub	r3, fp, #128
+	vstm	r3, { s8 - s31} 				// store floating point registers
+
+	ldr	r2, OLD_C					// pointer matrix C
+	ldr	r3, OLD_CSC					// Col stride size of C
+	lsl	r3, r3, #3					// multiply with size of complex float
+
+	mov	CO1, r2						// first line of C
+	add	CO2, CO1, r3					// second line of C
+
+	pld	[ CO1, #C_PRE ]					// prefetch the lines of C
+	pld	[ CO2, #C_PRE ]					// prefetch the lines of C
+
+cgemm_kernel_L2_M2_20:
+
+	asrs	L , K, #3					// L = K / 8
+	cmp	L , #2
+	blt	cgemm_kernel_L2_M2_32
+
+	KERNEL2x2_I
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	subs	L, L, #2
+	ble	cgemm_kernel_L2_M2_22a
+	.align 5
+
+cgemm_kernel_L2_M2_22:
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	subs	L, L, #1
+	bgt	cgemm_kernel_L2_M2_22
+
+cgemm_kernel_L2_M2_22a:
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_E
+
+	b	 cgemm_kernel_L2_M2_44
+
+cgemm_kernel_L2_M2_32:
+
+	tst	L, #1
+	ble	cgemm_kernel_L2_M2_40
+
+	KERNEL2x2_I
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_E
+
+	b	 cgemm_kernel_L2_M2_44
+
+cgemm_kernel_L2_M2_40:
+
+	INIT2x2
+
+cgemm_kernel_L2_M2_44:
+
+	ands	L , K, #7					// L = K % 8
+	ble	cgemm_kernel_L2_M2_100
+
+cgemm_kernel_L2_M2_46:
+
+	KERNEL2x2_SUB
+
+	subs	L, L, #1
+	bne	cgemm_kernel_L2_M2_46
+	
+cgemm_kernel_L2_M2_100:
+
+	SAVE2x2
+
+cgemm_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s31}					// restore floating point registers
+
+	sub	sp, fp, #28
+	pop	{r4 - r9, fp}
+	bx	lr
+
--- a/config/armv7a/kernels/3/bli_dgemm_kernel_4x4.S
+++ b/config/armv7a/kernels/3/bli_dgemm_kernel_4x4.S
@@ -0,0 +1,503 @@
+
+#define REALNAME bli_dgemm_kernel_4x4
+
+#define STACKSIZE 256
+
+#define	K		r0
+#define	PTR_ALPHA	r1
+#define	OLD_A		r2
+#define	OLD_B		r3
+#define PTR_BETA	[fp, #0 ]
+#define OLD_C		[fp, #4 ]
+#define OLD_RSC		[fp, #8 ]
+#define OLD_CSC		[fp, #12 ]
+#define AUX		[fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* register
+*******************************************************/
+
+#define L	r2
+
+#define	AO	r5
+#define	BO	r6
+
+#define	CO1	r7
+#define	CO2	r8
+#define	CO3	r9
+#define	CO4	r12
+
+
+#define A_PRE	96
+#define B_PRE	96
+#define C_PRE	0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x4
+
+	vsub.f64		d16 , d16 , d16
+	vmov.f64		d17, d16
+	vmov.f64		d18, d16
+	vmov.f64		d19, d16
+	vmov.f64		d20, d16
+	vmov.f64		d21, d16
+	vmov.f64		d22, d16
+	vmov.f64		d23, d16
+	vmov.f64		d24, d16
+	vmov.f64		d25, d16
+	vmov.f64		d26, d16
+	vmov.f64		d27, d16
+	vmov.f64		d28, d16
+	vmov.f64		d29, d16
+	vmov.f64		d30, d16
+	vmov.f64		d31, d16
+
+.endm
+
+.macro KERNEL4x4_I
+	pld	[ BO , #B_PRE ]
+	fldd	d8 , [ BO ]
+	fldd	d0 , [ AO ]
+	pld	[ AO , #A_PRE ]
+
+	fldd	d1 , [ AO, #8 ]
+	fmuld	d16  , d0,  d8
+	fldd	d2 , [ AO, #16 ]
+	fmuld	d17  , d1,  d8
+	fldd	d3 , [ AO, #24 ]
+	fmuld	d18  , d2,  d8
+	fldd	d9 , [ BO, #8 ]
+	fmuld	d19  , d3,  d8
+
+	fldd	d10, [ BO, #16 ]
+	fmuld	d20  , d0,  d9
+	fldd	d11, [ BO, #24 ]
+	fmuld	d21  , d1,  d9
+	add	BO , BO, #32
+	add	AO , AO, #32
+	fmuld	d22  , d2,  d9
+
+	pld	[ BO , #B_PRE ]
+	fldd	d12, [ BO ]
+	fmuld	d23  , d3,  d9
+
+	pld	[ AO , #A_PRE ]
+	fldd	d4 , [ AO, #0 ]
+	fmuld	d24  , d0,  d10
+	fldd	d5 , [ AO, #8 ]
+	fmuld	d25  , d1,  d10
+	fldd	d6 , [ AO, #16 ]
+	fmuld	d26  , d2,  d10
+	fldd	d7 , [ AO, #24 ]
+	fmuld	d27  , d3,  d10
+
+	fldd	d13, [ BO, #8 ]
+	fmuld	d28  , d0,  d11
+	fldd	d14, [ BO, #16 ]
+	fmuld	d29  , d1,  d11
+	fldd	d15, [ BO, #24 ]
+	fmuld	d30  , d2,  d11
+	fmuld	d31  , d3,  d11
+
+.endm
+
+.macro KERNEL4x4_M2
+
+	fmacd	d16  , d4,  d12
+	pld	[ AO , #A_PRE+32 ]
+	fmacd	d17  , d5,  d12
+	fldd	d0 , [ AO , #32 ]
+	fmacd	d18  , d6,  d12
+	pld	[ BO , #B_PRE+32 ]
+	fmacd	d19  , d7,  d12
+
+	fldd	d8 , [ BO , #32 ]
+	fmacd	d20  , d4,  d13
+	fldd	d1 , [ AO, #40 ]
+	fmacd	d21  , d5,  d13
+	fldd	d2 , [ AO, #48 ]
+	fmacd	d22  , d6,  d13
+	fldd	d3 , [ AO, #56 ]
+	fmacd	d23  , d7,  d13
+
+	fmacd	d24  , d4,  d14
+	fmacd	d25  , d5,  d14
+	fldd	d9 , [ BO, #40 ]
+	fmacd	d26  , d6,  d14
+	fldd	d10, [ BO, #48 ]
+	fmacd	d27  , d7,  d14
+
+	fldd	d11, [ BO, #56 ]
+	fmacd	d28  , d4,  d15
+	fmacd	d29  , d5,  d15
+	add	AO , AO, #64
+	fmacd	d30  , d6,  d15
+	add	BO , BO, #64
+	fmacd	d31  , d7,  d15
+
+.endm
+
+.macro KERNEL4x4_M1
+
+	fmacd	d16  , d0,  d8
+	pld	[ AO , #A_PRE ]
+	fmacd	d17  , d1,  d8
+	fldd	d4 , [ AO ]
+	fmacd	d18  , d2,  d8
+	pld	[ BO , #B_PRE ]
+	fmacd	d19  , d3,  d8
+
+	fldd	d12, [ BO ]
+	fmacd	d20  , d0,  d9
+	fldd	d5 , [ AO, #8 ]
+	fmacd	d21  , d1,  d9
+	fldd	d6 , [ AO, #16 ]
+	fmacd	d22  , d2,  d9
+	fldd	d7 , [ AO, #24 ]
+	fmacd	d23  , d3,  d9
+
+	fmacd	d24  , d0,  d10
+	fmacd	d25  , d1,  d10
+	fldd	d13, [ BO, #8 ]
+	fmacd	d26  , d2,  d10
+	fldd	d14, [ BO, #16 ]
+	fmacd	d27  , d3,  d10
+
+	fldd	d15, [ BO, #24 ]
+	fmacd	d28  , d0,  d11
+	fmacd	d29  , d1,  d11
+	fmacd	d30  , d2,  d11
+	fmacd	d31  , d3,  d11
+
+.endm
+
+.macro KERNEL4x4_E
+
+	fmacd	d16  , d4,  d12
+	fmacd	d17  , d5,  d12
+	add	BO , BO, #32
+	fmacd	d18  , d6,  d12
+	add	AO , AO, #32
+	fmacd	d19  , d7,  d12
+
+	fmacd	d20  , d4,  d13
+	fmacd	d21  , d5,  d13
+	fmacd	d22  , d6,  d13
+	fmacd	d23  , d7,  d13
+
+	fmacd	d24  , d4,  d14
+	fmacd	d25  , d5,  d14
+	fmacd	d26  , d6,  d14
+	fmacd	d27  , d7,  d14
+
+	fmacd	d28  , d4,  d15
+	fmacd	d29  , d5,  d15
+	fmacd	d30  , d6,  d15
+	fmacd	d31  , d7,  d15
+
+.endm
+
+.macro KERNEL4x4_SUB
+
+	fldd	d8 , [ BO ]
+	pld	[ BO , #B_PRE ]
+
+	fldd	d0 , [ AO ]
+	pld	[ AO , #A_PRE ]
+	fldd	d1 , [ AO, #8 ]
+
+	fmacd	d16  , d0,  d8
+	fldd	d2 , [ AO, #16 ]
+	fmacd	d17  , d1,  d8
+	fldd	d3 , [ AO, #24 ]
+	fmacd	d18  , d2,  d8
+	fldd	d9 , [ BO, #8 ]
+	fmacd	d19  , d3,  d8
+
+	fldd	d10, [ BO, #16 ]
+	fmacd	d20  , d0,  d9
+	fldd	d11, [ BO, #24 ]
+	fmacd	d21  , d1,  d9
+	fmacd	d22  , d2,  d9
+	fmacd	d23  , d3,  d9
+
+	fmacd	d24  , d0,  d10
+	fmacd	d25  , d1,  d10
+	fmacd	d26  , d2,  d10
+	fmacd	d27  , d3,  d10
+
+	fmacd	d28  , d0,  d11
+	fmacd	d29  , d1,  d11
+	add	AO , AO, #32
+	fmacd	d30  , d2,  d11
+	add	BO , BO, #32
+	fmacd	d31  , d3,  d11
+
+.endm
+
+.macro SAVE4x4
+
+	ldr	r3, OLD_RSC				// Row stride size
+	lsl	r3, r3, #3				// multiply with size of double
+
+	fldd	d0, [ PTR_ALPHA	]			// load alpha
+	ldr	r4, PTR_BETA
+	fldd	d1, [ r4 ]				// load beta
+
+//-----------------------------------------------------------
+	mov	r2, CO1					// save pointer
+	mov	r4, CO2					// save pointer
+	fldd	d8, [ CO1 ]				// load value from C
+	fldd	d12, [ CO2 ]				// load value from C
+	fmuld	d8, d8, d1				// multiply with beta
+	add	CO1, CO1, r3				// compute next pointer
+	fmacd	d8, d0, d16				// multiply sum with alpha and add to value of C	
+	add	CO2, CO2, r3				// compute next pointer
+
+	fldd	d9, [ CO1 ]				// load value from C
+	fldd	d13, [ CO2 ]				// load value from C
+	fmuld	d9, d9, d1				// multiply with beta
+	add	CO1, CO1, r3				// compute next pointer
+	fmacd	d9, d0, d17				// multiply sum with alpha and add to value of C	
+	add	CO2, CO2, r3				// compute next pointer
+
+	fldd	d10, [ CO1 ]				// load value from C
+	fldd	d14, [ CO2 ]				// load value from C
+	fmuld	d10, d10, d1				// multiply with beta
+	add	CO1, CO1, r3				// compute next pointer
+	fmacd	d10, d0, d18				// multiply sum with alpha and add to value of C	
+	add	CO2, CO2, r3				// compute next pointer
+
+	fldd	d11, [ CO1 ]				// load value from C
+	fldd	d15, [ CO2 ]				// load value from C
+	fmuld	d11, d11, d1				// multiply with beta
+	mov	CO1, r2					// restore pointer
+	fmacd	d11, d0, d19				// multiply sum with alpha and add to value of C	
+	mov	CO2, r4					// restore pointer
+	
+	fstd	d8, [ CO1 ]				// store value in C
+	add	CO1 , CO1, r3				// compute next pointer
+	fstd	d9, [ CO1 ]				// store value in C
+	add	CO1 , CO1, r3				// compute next pointer
+	fstd	d10, [ CO1 ]				// store value in C
+	add	CO1 , CO1, r3				// compute next pointer
+	fstd	d11, [ CO1 ]				// store value in C
+
+//-----------------------------------------------------------
+	mov	r2, CO3					// save pointer
+	fldd	d8, [ CO3 ]				// load value from C
+	fmuld	d12, d12, d1				// multiply with beta
+	add	CO3, CO3, r3				// compute next pointer
+	fmacd	d12, d0, d20				// multiply sum with alpha and add to value of C	
+
+	fldd	d9, [ CO3 ]				// load value from C
+	fmuld	d13, d13, d1				// multiply with beta
+	add	CO3, CO3, r3				// compute next pointer
+	fmacd	d13, d0, d21				// multiply sum with alpha and add to value of C	
+
+	fldd	d10, [ CO3 ]				// load value from C
+	fmuld	d14, d14, d1				// multiply with beta
+	add	CO3, CO3, r3				// compute next pointer
+	fmacd	d14, d0, d22				// multiply sum with alpha and add to value of C	
+
+	fldd	d11, [ CO3 ]				// load value from C
+	fmuld	d15, d15, d1				// multiply with beta
+	mov	CO3, r2					// restore pointer
+	fmacd	d15, d0, d23				// multiply sum with alpha and add to value of C	
+	
+	fstd	d12, [ CO2 ]				// store value in C
+	add	CO2 , CO2, r3				// compute next pointer
+	fstd	d13, [ CO2 ]				// store value in C
+	add	CO2 , CO2, r3				// compute next pointer
+	fstd	d14, [ CO2 ]				// store value in C
+	add	CO2 , CO2, r3				// compute next pointer
+	fstd	d15, [ CO2 ]				// store value in C
+
+//-----------------------------------------------------------
+	mov	r4, CO4					// save pointer
+	fldd	d12, [ CO4 ]				// load value from C
+	fmuld	d8, d8, d1				// multiply with beta
+	add	CO4, CO4, r3				// compute next pointer
+	fmacd	d8, d0, d24				// multiply sum with alpha and add to value of C	
+
+	fldd	d13, [ CO4 ]				// load value from C
+	fmuld	d9, d9, d1				// multiply with beta
+	add	CO4, CO4, r3				// compute next pointer
+	fmacd	d9, d0, d25				// multiply sum with alpha and add to value of C	
+
+	fldd	d14, [ CO4 ]				// load value from C
+	fmuld	d10, d10, d1				// multiply with beta
+	add	CO4, CO4, r3				// compute next pointer
+	fmacd	d10, d0, d26				// multiply sum with alpha and add to value of C	
+
+	fldd	d15, [ CO4 ]				// load value from C
+	fmuld	d11, d11, d1				// multiply with beta
+	mov	CO4, r4					// restore pointer
+	fmacd	d11, d0, d27				// multiply sum with alpha and add to value of C	
+	
+
+//-----------------------------------------------------------
+	fstd	d8, [ CO3 ]				// store value in C
+	fmuld	d12, d12, d1				// multiply with beta
+	add	CO3 , CO3, r3				// compute next pointer
+	fmacd	d12, d0, d28				// multiply sum with alpha and add to value of C	
+
+	fstd	d9, [ CO3 ]				// store value in C
+	fmuld	d13, d13, d1				// multiply with beta
+	add	CO3 , CO3, r3				// compute next pointer
+	fmacd	d13, d0, d29				// multiply sum with alpha and add to value of C	
+
+	fstd	d10, [ CO3 ]				// store value in C
+	fmuld	d14, d14, d1				// multiply with beta
+	add	CO3 , CO3, r3				// compute next pointer
+	fmacd	d14, d0, d30				// multiply sum with alpha and add to value of C	
+
+	fstd	d11, [ CO3 ]				// store value in C
+	fmuld	d15, d15, d1				// multiply with beta
+	fstd	d12, [ CO4 ]				// store value in C
+	fmacd	d15, d0, d31				// multiply sum with alpha and add to value of C	
+	
+	add	CO4 , CO4, r3				// compute next pointer
+	fstd	d13, [ CO4 ]				// store value in C
+	add	CO4 , CO4, r3				// compute next pointer
+	fstd	d14, [ CO4 ]				// store value in C
+	add	CO4 , CO4, r3				// compute next pointer
+	fstd	d15, [ CO4 ]				// store value in C
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        .arm             	
+        .global REALNAME 	
+        .func   REALNAME 	
+
+REALNAME:
+
+	push	{r4 - r9, fp}					// save register
+	add	fp, sp, #28					// add number of saved register multiplied by size of int
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	mov	AO, OLD_A					// pointer matrix A
+	mov	BO, OLD_B					// pointer matrix B
+
+	sub	r3, fp, #128
+	vstm	r3, { d8 - d15} 				// store floating point registers
+
+	ldr	r2, OLD_C					// pointer matrix C
+	ldr	r3, OLD_CSC					// Col stride size of C
+	lsl	r3, r3, #3					// multiply with size of double
+
+	mov	CO1, r2						// first line of C
+	add	CO2, CO1, r3					// second line of C
+	add	CO3, CO2, r3					// third line of C
+	add	CO4, CO3, r3					// fourth line of C
+
+	pld	[ CO1, #C_PRE ]					// prefetch the lines of C
+	pld	[ CO2, #C_PRE ]					// prefetch the lines of C
+	pld	[ CO3, #C_PRE ]					// prefetch the lines of C
+	pld	[ CO3, #C_PRE ]					// prefetch the lines of C
+
+dgemm_kernel_L4_M4_20:
+
+	asrs	L , K, #3					// L = K / 8
+	cmp	L , #2
+	blt	dgemm_kernel_L4_M4_32
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	L, L, #2
+	ble	dgemm_kernel_L4_M4_22a
+	.align 5
+
+dgemm_kernel_L4_M4_22:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	L, L, #1
+	bgt	dgemm_kernel_L4_M4_22
+
+dgemm_kernel_L4_M4_22a:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	 dgemm_kernel_L4_M4_44
+
+dgemm_kernel_L4_M4_32:
+
+	tst	L, #1
+	ble	dgemm_kernel_L4_M4_40
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	 dgemm_kernel_L4_M4_44
+
+dgemm_kernel_L4_M4_40:
+
+	INIT4x4
+
+dgemm_kernel_L4_M4_44:
+
+	ands	L , K, #7					// L = K % 8
+	ble	dgemm_kernel_L4_M4_100
+
+dgemm_kernel_L4_M4_46:
+
+	KERNEL4x4_SUB
+
+	subs	L, L, #1
+	bne	dgemm_kernel_L4_M4_46
+	
+dgemm_kernel_L4_M4_100:
+
+	SAVE4x4
+
+dgemm_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	sub	sp, fp, #28
+	pop	{r4 - r9, fp}
+	bx	lr
+
--- a/config/armv7a/kernels/3/bli_gemm_opt_4x4.c
+++ b/config/armv7a/kernels/3/bli_gemm_opt_4x4.c
@@ -0,0 +1,134 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2012, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+extern void bli_sgemm_kernel_4x4(dim_t              k,
+                        float* alpha,
+                        float*   restrict a,
+                        float*   restrict b,
+                        float* beta,
+                        float*   restrict c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      );
+
+
+void bli_sgemm_opt_4x4(
+                        dim_t              k,
+                        float*    restrict alpha,
+                        float*    restrict a,
+                        float*    restrict b,
+                        float*    restrict beta,
+                        float*    restrict c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      )
+{
+
+	bli_sgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data);
+
+}
+
+extern void bli_dgemm_kernel_4x4(dim_t              k,
+                        double* alpha,
+                        double*   restrict a,
+                        double*   restrict b,
+                        double* beta,
+                        double*   restrict c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      );
+
+
+void bli_dgemm_opt_4x4(
+                        dim_t              k,
+                        double*   restrict alpha,
+                        double*   restrict a,
+                        double*   restrict b,
+                        double*   restrict beta,
+                        double*   restrict c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      )
+{
+	bli_dgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data);
+}
+
+extern void bli_cgemm_kernel_2x2(dim_t              k,
+                        scomplex* alpha,
+                        scomplex*   restrict a,
+                        scomplex*   restrict b,
+                        scomplex* beta,
+                        scomplex*   restrict c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      );
+
+
+
+
+void bli_cgemm_opt_4x4(
+                        dim_t              k,
+                        scomplex* restrict alpha,
+                        scomplex* restrict a,
+                        scomplex* restrict b,
+                        scomplex* restrict beta,
+                        scomplex* restrict c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      )
+{
+
+	bli_cgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data);
+}
+
+extern void bli_zgemm_kernel_2x2(dim_t              k,
+                        dcomplex* alpha,
+                        dcomplex*   restrict a,
+                        dcomplex*   restrict b,
+                        dcomplex* beta,
+                        dcomplex*   restrict c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      );
+
+
+void bli_zgemm_opt_4x4(
+                        dim_t              k,
+                        dcomplex* restrict alpha,
+                        dcomplex* restrict a,
+                        dcomplex* restrict b,
+                        dcomplex* restrict beta,
+                        dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      )
+{
+
+	bli_zgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data);
+}
+
--- a/config/armv7a/kernels/3/bli_gemm_opt_4x4.h
+++ b/config/armv7a/kernels/3/bli_gemm_opt_4x4.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS    
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+// #include "arm_neon.h"
+
+
+#undef  GENTPROT
+#define GENTPROT( ctype, ch, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           dim_t           k, \
+                           ctype* restrict alpha, \
+                           ctype* restrict a, \
+                           ctype* restrict b, \
+                           ctype* restrict beta, \
+                           ctype* restrict c, inc_t rs_c, inc_t cs_c, \
+                           auxinfo_t*      data  \
+                         );
+
+INSERT_GENTPROT_BASIC( gemm_opt_4x4 )
+
--- a/config/armv7a/kernels/3/bli_sgemm_kernel_4x4.S
+++ b/config/armv7a/kernels/3/bli_sgemm_kernel_4x4.S
@@ -0,0 +1,483 @@
+
+#define REALNAME bli_sgemm_kernel_4x4
+
+#define STACKSIZE 256
+
+#define	K		r0
+#define	PTR_ALPHA	r1
+#define	OLD_A		r2
+#define	OLD_B		r3
+#define PTR_BETA	[fp, #0 ]
+#define OLD_C		[fp, #4 ]
+#define OLD_RSC		[fp, #8 ]
+#define OLD_CSC		[fp, #12 ]
+#define AUX		[fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* register
+*******************************************************/
+
+#define L	r2
+
+#define	AO	r5
+#define	BO	r6
+
+#define	CO1	r7
+#define	CO2	r8
+#define	CO3	r9
+#define	CO4	r12
+
+
+#define A_PRE	96
+#define B_PRE	96
+#define C_PRE	0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x4
+
+	vsub.f32		s16 , s16 , s16
+	vmov.f32		s17, s16
+	vmov.f32		s18, s16
+	vmov.f32		s19, s16
+	vmov.f32		s20, s16
+	vmov.f32		s21, s16
+	vmov.f32		s22, s16
+	vmov.f32		s23, s16
+	vmov.f32		s24, s16
+	vmov.f32		s25, s16
+	vmov.f32		s26, s16
+	vmov.f32		s27, s16
+	vmov.f32		s28, s16
+	vmov.f32		s29, s16
+	vmov.f32		s30, s16
+	vmov.f32		s31, s16
+
+.endm
+
+.macro KERNEL4x4_I
+
+	pld	[ AO , #A_PRE ]
+	fldmias AO!, { s0 - s1 }
+	pld	[ BO , #B_PRE ]
+	fldmias BO!, { s8 - s9 }
+
+	fmuls	s16  , s0,  s8
+	fldmias AO!, { s2 - s3 }
+	fmuls	s17  , s1,  s8
+	fmuls	s18  , s2,  s8
+	fldmias BO!, { s10 - s11 }
+	fmuls	s19  , s3,  s8
+
+	fmuls	s20  , s0,  s9
+	fldmias AO!, { s4 - s5 }
+	fmuls	s21  , s1,  s9
+	fmuls	s22  , s2,  s9
+	fldmias AO!, { s6 - s7 }
+	fmuls	s23  , s3,  s9
+
+	fmuls	s24  , s0,  s10
+	fldmias BO!, { s12 - s13 }
+	fmuls	s25  , s1,  s10
+	fmuls	s26  , s2,  s10
+	fldmias BO!, { s14 - s15 }
+	fmuls	s27  , s3,  s10
+
+	fmuls	s28  , s0,  s11
+	fmuls	s29  , s1,  s11
+	fmuls	s30  , s2,  s11
+	fmuls	s31  , s3,  s11
+
+.endm
+
+
+.macro KERNEL4x4_M2
+
+	pld	[ AO , #A_PRE ]
+	fmacs	s16  , s4,  s12
+	fmacs	s17  , s5,  s12
+	fldmias AO!, { s0 - s3 }
+	fmacs	s18  , s6,  s12
+	pld	[ BO , #B_PRE ]
+	fmacs	s19  , s7,  s12
+
+	fmacs	s20  , s4,  s13
+	fldmias BO!, { s8 - s11 }
+	fmacs	s21  , s5,  s13
+	fmacs	s22  , s6,  s13
+	//fldmias AO!, { s2 - s3 }
+	fmacs	s23  , s7,  s13
+
+	fmacs	s24  , s4,  s14
+	//fldmias BO!, { s10 - s11 }
+	fmacs	s25  , s5,  s14
+	fmacs	s26  , s6,  s14
+	fmacs	s27  , s7,  s14
+
+	fmacs	s28  , s4,  s15
+	fmacs	s29  , s5,  s15
+	fmacs	s30  , s6,  s15
+	fmacs	s31  , s7,  s15
+
+.endm
+
+
+.macro KERNEL4x4_M1
+
+	fmacs	s16  , s0,  s8
+	fldmias AO!, { s4 - s7 }
+	fmacs	s17  , s1,  s8
+	fmacs	s18  , s2,  s8
+	fldmias BO!, { s12 - s15 }
+	//fldmias AO!, { s6 - s7 }
+	fmacs	s19  , s3,  s8
+
+	fmacs	s20  , s0,  s9
+	fmacs	s21  , s1,  s9
+	fmacs	s22  , s2,  s9
+	//fldmias BO!, { s14 - s15 }
+	fmacs	s23  , s3,  s9
+
+	fmacs	s24  , s0,  s10
+	fmacs	s25  , s1,  s10
+	fmacs	s26  , s2,  s10
+	fmacs	s27  , s3,  s10
+
+	fmacs	s28  , s0,  s11
+	fmacs	s29  , s1,  s11
+	fmacs	s30  , s2,  s11
+	fmacs	s31  , s3,  s11
+
+.endm
+
+
+
+.macro KERNEL4x4_E
+
+	fmacs	s16  , s4,  s12
+	fmacs	s17  , s5,  s12
+	fmacs	s18  , s6,  s12
+	fmacs	s19  , s7,  s12
+
+	fmacs	s20  , s4,  s13
+	fmacs	s21  , s5,  s13
+	fmacs	s22  , s6,  s13
+	fmacs	s23  , s7,  s13
+
+	fmacs	s24  , s4,  s14
+	fmacs	s25  , s5,  s14
+	fmacs	s26  , s6,  s14
+	fmacs	s27  , s7,  s14
+
+	fmacs	s28  , s4,  s15
+	fmacs	s29  , s5,  s15
+	fmacs	s30  , s6,  s15
+	fmacs	s31  , s7,  s15
+
+.endm
+
+
+
+
+.macro KERNEL4x4_SUB
+
+	flds	s8 , [ BO ]
+
+	flds	s0 , [ AO ]
+	flds	s1 , [ AO, #4 ]
+
+	fmacs	s16  , s0,  s8
+	flds	s2 , [ AO, #8 ]
+	fmacs	s17  , s1,  s8
+	flds	s3 , [ AO, #12 ]
+	fmacs	s18  , s2,  s8
+	flds	s9 , [ BO, #4 ]
+	fmacs	s19  , s3,  s8
+
+	flds	s10, [ BO, #8 ]
+	fmacs	s20  , s0,  s9
+	flds	s11, [ BO, #12 ]
+	fmacs	s21  , s1,  s9
+	fmacs	s22  , s2,  s9
+	fmacs	s23  , s3,  s9
+
+	fmacs	s24  , s0,  s10
+	fmacs	s25  , s1,  s10
+	fmacs	s26  , s2,  s10
+	fmacs	s27  , s3,  s10
+
+	fmacs	s28  , s0,  s11
+	fmacs	s29  , s1,  s11
+	add	AO , AO, #16
+	fmacs	s30  , s2,  s11
+	add	BO , BO, #16
+	fmacs	s31  , s3,  s11
+
+.endm
+
+
+.macro SAVE4x4
+
+	ldr	r3, OLD_RSC				// Row stride size
+	lsl	r3, r3, #2				// multiply with size of float
+
+	flds	s0, [ PTR_ALPHA	]			// load alpha
+	ldr	r4, PTR_BETA
+	flds	s1, [ r4 ]				// load beta
+
+//-----------------------------------------------------------
+	mov	r2, CO1					// save pointer
+	mov	r4, CO2					// save pointer
+	flds	s8, [ CO1 ]				// load value from C
+	flds	s12, [ CO2 ]				// load value from C
+	fmuls	s8, s8, s1				// multiply with beta
+	add	CO1, CO1, r3				// compute next pointer
+	fmacs	s8, s0, s16				// multiply sum with alpha and add to value of C	
+	add	CO2, CO2, r3				// compute next pointer
+
+	flds	s9, [ CO1 ]				// load value from C
+	flds	s13, [ CO2 ]				// load value from C
+	fmuls	s9, s9, s1				// multiply with beta
+	add	CO1, CO1, r3				// compute next pointer
+	fmacs	s9, s0, s17				// multiply sum with alpha and add to value of C	
+	add	CO2, CO2, r3				// compute next pointer
+
+	flds	s10, [ CO1 ]				// load value from C
+	flds	s14, [ CO2 ]				// load value from C
+	fmuls	s10, s10, s1				// multiply with beta
+	add	CO1, CO1, r3				// compute next pointer
+	fmacs	s10, s0, s18				// multiply sum with alpha and add to value of C	
+	add	CO2, CO2, r3				// compute next pointer
+
+	flds	s11, [ CO1 ]				// load value from C
+	flds	s15, [ CO2 ]				// load value from C
+	fmuls	s11, s11, s1				// multiply with beta
+	mov	CO1, r2					// restore pointer
+	fmacs	s11, s0, s19				// multiply sum with alpha and add to value of C	
+	mov	CO2, r4					// restore pointer
+	
+	fsts	s8, [ CO1 ]				// store value in C
+	add	CO1 , CO1, r3				// compute next pointer
+	fsts	s9, [ CO1 ]				// store value in C
+	add	CO1 , CO1, r3				// compute next pointer
+	fsts	s10, [ CO1 ]				// store value in C
+	add	CO1 , CO1, r3				// compute next pointer
+	fsts	s11, [ CO1 ]				// store value in C
+
+//-----------------------------------------------------------
+	mov	r2, CO3					// save pointer
+	flds	s8, [ CO3 ]				// load value from C
+	fmuls	s12, s12, s1				// multiply with beta
+	add	CO3, CO3, r3				// compute next pointer
+	fmacs	s12, s0, s20				// multiply sum with alpha and add to value of C	
+
+	flds	s9, [ CO3 ]				// load value from C
+	fmuls	s13, s13, s1				// multiply with beta
+	add	CO3, CO3, r3				// compute next pointer
+	fmacs	s13, s0, s21				// multiply sum with alpha and add to value of C	
+
+	flds	s10, [ CO3 ]				// load value from C
+	fmuls	s14, s14, s1				// multiply with beta
+	add	CO3, CO3, r3				// compute next pointer
+	fmacs	s14, s0, s22				// multiply sum with alpha and add to value of C	
+
+	flds	s11, [ CO3 ]				// load value from C
+	fmuls	s15, s15, s1				// multiply with beta
+	mov	CO3, r2					// restore pointer
+	fmacs	s15, s0, s23				// multiply sum with alpha and add to value of C	
+	
+	fsts	s12, [ CO2 ]				// store value in C
+	add	CO2 , CO2, r3				// compute next pointer
+	fsts	s13, [ CO2 ]				// store value in C
+	add	CO2 , CO2, r3				// compute next pointer
+	fsts	s14, [ CO2 ]				// store value in C
+	add	CO2 , CO2, r3				// compute next pointer
+	fsts	s15, [ CO2 ]				// store value in C
+
+//-----------------------------------------------------------
+	mov	r4, CO4					// save pointer
+	flds	s12, [ CO4 ]				// load value from C
+	fmuls	s8, s8, s1				// multiply with beta
+	add	CO4, CO4, r3				// compute next pointer
+	fmacs	s8, s0, s24				// multiply sum with alpha and add to value of C	
+
+	flds	s13, [ CO4 ]				// load value from C
+	fmuls	s9, s9, s1				// multiply with beta
+	add	CO4, CO4, r3				// compute next pointer
+	fmacs	s9, s0, s25				// multiply sum with alpha and add to value of C	
+
+	flds	s14, [ CO4 ]				// load value from C
+	fmuls	s10, s10, s1				// multiply with beta
+	add	CO4, CO4, r3				// compute next pointer
+	fmacs	s10, s0, s26				// multiply sum with alpha and add to value of C	
+
+	flds	s15, [ CO4 ]				// load value from C
+	fmuls	s11, s11, s1				// multiply with beta
+	mov	CO4, r4					// restore pointer
+	fmacs	s11, s0, s27				// multiply sum with alpha and add to value of C	
+	
+
+//-----------------------------------------------------------
+	fsts	s8, [ CO3 ]				// store value in C
+	fmuls	s12, s12, s1				// multiply with beta
+	add	CO3 , CO3, r3				// compute next pointer
+	fmacs	s12, s0, s28				// multiply sum with alpha and add to value of C	
+
+	fsts	s9, [ CO3 ]				// store value in C
+	fmuls	s13, s13, s1				// multiply with beta
+	add	CO3 , CO3, r3				// compute next pointer
+	fmacs	s13, s0, s29				// multiply sum with alpha and add to value of C	
+
+	fsts	s10, [ CO3 ]				// store value in C
+	fmuls	s14, s14, s1				// multiply with beta
+	add	CO3 , CO3, r3				// compute next pointer
+	fmacs	s14, s0, s30				// multiply sum with alpha and add to value of C	
+
+	fsts	s11, [ CO3 ]				// store value in C
+	fmuls	s15, s15, s1				// multiply with beta
+	fsts	s12, [ CO4 ]				// store value in C
+	fmacs	s15, s0, s31				// multiply sum with alpha and add to value of C	
+	
+	add	CO4 , CO4, r3				// compute next pointer
+	fsts	s13, [ CO4 ]				// store value in C
+	add	CO4 , CO4, r3				// compute next pointer
+	fsts	s14, [ CO4 ]				// store value in C
+	add	CO4 , CO4, r3				// compute next pointer
+	fsts	s15, [ CO4 ]				// store value in C
+
+.endm
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        .arm             	
+        .global REALNAME 	
+        .func   REALNAME 	
+
+REALNAME:
+
+	push	{r4 - r9, fp}					// save register
+	add	fp, sp, #28					// add number of saved register multiplied by size of int
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	mov	AO, OLD_A					// pointer matrix A
+	mov	BO, OLD_B					// pointer matrix B
+
+	sub	r3, fp, #128
+	vstm	r3, { s8 - s31 } 				// store floating point registers
+
+	ldr	r2, OLD_C					// pointer matrix C
+	ldr	r3, OLD_CSC					// Col stride size of C
+	lsl	r3, r3, #2					// multiply with size of float
+
+	mov	CO1, r2						// first line of C
+	add	CO2, CO1, r3					// second line of C
+	add	CO3, CO2, r3					// third line of C
+	add	CO4, CO3, r3					// fourth line of C
+
+	pld	[ CO1, #C_PRE ]					// prefetch the lines of C
+	pld	[ CO2, #C_PRE ]					// prefetch the lines of C
+	pld	[ CO3, #C_PRE ]					// prefetch the lines of C
+	pld	[ CO3, #C_PRE ]					// prefetch the lines of C
+
+sgemm_kernel_L4_M4_20:
+
+	asrs	L , K, #3					// L = K / 8
+	cmp	L , #2
+	blt	sgemm_kernel_L4_M4_32
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	L, L, #2
+	ble	sgemm_kernel_L4_M4_22a
+	.align 5
+
+sgemm_kernel_L4_M4_22:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	L, L, #1
+	bgt	sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	 sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:
+
+	tst	L, #1
+	ble	sgemm_kernel_L4_M4_40
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	 sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_40:
+
+	INIT4x4
+
+sgemm_kernel_L4_M4_44:
+
+	ands	L , K, #7					// L = K % 8
+	ble	sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+	KERNEL4x4_SUB
+
+	subs	L, L, #1
+	bne	sgemm_kernel_L4_M4_46
+	
+sgemm_kernel_L4_M4_100:
+
+	SAVE4x4
+
+sgemm_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s31 }				// restore floating point registers
+
+	sub	sp, fp, #28
+	pop	{r4 - r9, fp}
+	bx	lr
+
--- a/config/armv7a/kernels/3/bli_zgemm_kernel_2x2.S
+++ b/config/armv7a/kernels/3/bli_zgemm_kernel_2x2.S
@@ -0,0 +1,506 @@
+
+#define REALNAME bli_zgemm_kernel_2x2
+
+#define STACKSIZE 256
+
+#define	K		r0
+#define	PTR_ALPHA	r1
+#define	OLD_A		r2
+#define	OLD_B		r3
+#define PTR_BETA	[fp, #0 ]
+#define OLD_C		[fp, #4 ]
+#define OLD_RSC		[fp, #8 ]
+#define OLD_CSC		[fp, #12 ]
+#define AUX		[fp, #16 ]
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* register
+*******************************************************/
+
+#define L	r2
+
+#define	AO	r5
+#define	BO	r6
+
+#define	CO1	r7
+#define	CO2	r8
+
+
+#define A_PRE	96
+#define B_PRE	96
+#define C_PRE	0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#define FMAC_BR	fnmacd
+#define FMAC_BI	fmacd
+
+#define NN 1
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) 
+
+	#define	FADD_R	fsubd
+	#define	FADD_I	faddd
+
+	#define	FMAC_R1	fnmacd
+	#define	FMAC_R2	fnmacd
+	#define	FMAC_I1	fmacd
+	#define	FMAC_I2	fnmacd
+
+#elif defined(CN) || defined(CT)
+
+	#define	FADD_R	faddd
+	#define	FADD_I	fsubd
+
+	#define	FMAC_R1	fmacd
+	#define	FMAC_R2	fmacd
+	#define	FMAC_I1	fnmacd
+	#define	FMAC_I2	fmacd
+
+#elif defined(NC) || defined(TC)
+
+	#define	FADD_R	faddd
+	#define	FADD_I	fsubd
+
+	#define	FMAC_R1	fmacd
+	#define	FMAC_R2	fnmacd
+	#define	FMAC_I1	fmacd
+	#define	FMAC_I2	fmacd
+
+#else
+
+	#define	FADD_R  fsubd
+	#define	FADD_I	faddd
+
+	#define	FMAC_R1	fnmacd
+	#define	FMAC_R2	fmacd
+	#define	FMAC_I1	fnmacd
+	#define	FMAC_I2	fnmacd
+
+#endif
+
+
+
+.macro INIT2x2
+
+	vsub.f64		d16 , d16 , d16
+	vmov.f64		d17, d16
+	vmov.f64		d18, d16
+	vmov.f64		d19, d16
+	vmov.f64		d20, d16
+	vmov.f64		d21, d16
+	vmov.f64		d22, d16
+	vmov.f64		d23, d16
+	vmov.f64		d24, d16
+	vmov.f64		d25, d16
+	vmov.f64		d26, d16
+	vmov.f64		d27, d16
+	vmov.f64		d28, d16
+	vmov.f64		d29, d16
+	vmov.f64		d30, d16
+	vmov.f64		d31, d16
+
+.endm
+
+.macro KERNEL2x2_I
+	pld	[ AO , #A_PRE ]
+	pld	[ BO , #B_PRE ]
+	fldd	d0 , [ AO ]
+	fldd	d1 , [ AO, #8 ]
+	fldd	d8 , [ BO ]
+	fldd	d9 , [ BO, #8 ]
+
+	fmuld	d16  , d0,  d8
+	fldd	d2 , [ AO, #16 ]
+	fmuld	d24  , d1,  d9
+	fldd	d3 , [ AO, #24 ]
+	fmuld	d17  , d0,  d9
+	fldd	d10, [ BO, #16 ]
+	fmuld	d25  , d1,  d8
+
+	fldd	d11, [ BO, #24 ]
+	fmuld	d18  , d2,  d8
+	add	BO , BO, #32
+	fmuld	d26  , d3,  d9
+	add	AO , AO, #32
+	fmuld	d19  , d2,  d9
+	pld	[ BO , #B_PRE ]
+	fmuld	d27  , d3,  d8
+
+	pld	[ AO , #A_PRE ]
+	fmuld	d20  , d0,  d10
+	fldd	d4 , [ AO, #0 ]
+	fmuld	d28  , d1,  d11
+	fldd	d5 , [ AO, #8 ]
+	fmuld	d21  , d0,  d11
+	fldd	d12, [ BO ]
+	fmuld	d29  , d1,  d10
+
+	fldd	d13, [ BO, #8 ]
+	fmuld	d22  , d2,  d10
+	fldd	d6 , [ AO, #16 ]
+	fmuld	d30  , d3,  d11
+	fldd	d7 , [ AO, #24 ]
+	fmuld	d23  , d2,  d11
+	fldd	d14, [ BO, #16 ]
+	fmuld	d31  , d3,  d10
+	fldd	d15, [ BO, #24 ]
+
+	add	BO , BO, #32
+	add	AO , AO, #32
+.endm
+
+
+
+.macro KERNEL2x2_M1
+	pld	[ AO , #A_PRE ]
+
+	fmacd	d16  , d0,  d8
+	pld	[ BO , #B_PRE ]
+	fmacd	d24  , d1,  d9
+	fldd	d4 , [ AO, #0 ]
+	fmacd	d17  , d0,  d9
+	fldd	d5 , [ AO, #8 ]
+	fmacd	d25  , d1,  d8
+
+	fldd	d12, [ BO ]
+	fmacd	d18  , d2,  d8
+	fldd	d13, [ BO, #8 ]
+	fmacd	d26  , d3,  d9
+	fldd	d6 , [ AO, #16 ]
+	fmacd	d19  , d2,  d9
+	fldd	d7 , [ AO, #24 ]
+	fmacd	d27  , d3,  d8
+
+	fmacd	d20  , d0,  d10
+	fldd	d14, [ BO, #16 ]
+	fmacd	d28  , d1,  d11
+	fmacd	d21  , d0,  d11
+	fldd	d15, [ BO, #24 ]
+	fmacd	d29  , d1,  d10
+
+	fmacd	d22  , d2,  d10
+	add	BO , BO, #32
+	fmacd	d30  , d3,  d11
+	fmacd	d23  , d2,  d11
+	add	AO , AO, #32
+	fmacd	d31  , d3,  d10
+
+.endm
+
+.macro KERNEL2x2_M2
+	pld	[ AO , #A_PRE ]
+
+	fmacd	d16  , d4,  d12
+	pld	[ BO , #B_PRE ]
+	fmacd	d24  , d5,  d13
+	fldd	d0 , [ AO, #0 ]
+	fmacd	d17  , d4,  d13
+	fldd	d1 , [ AO, #8 ]
+	fmacd	d25  , d5,  d12
+
+	fmacd	d18  , d6,  d12
+	fldd	d8 , [ BO ]
+	fmacd	d26  , d7,  d13
+	fldd	d9 , [ BO, #8 ]
+	fmacd	d19  , d6,  d13
+	fmacd	d27  , d7,  d12
+
+	fldd	d2 , [ AO, #16 ]
+	fmacd	d20  , d4,  d14
+	fldd	d3 , [ AO, #24 ]
+	fmacd	d28  , d5,  d15
+	fmacd	d21  , d4,  d15
+	fldd	d10, [ BO, #16 ]
+	fmacd	d29  , d5,  d14
+
+	fldd	d11, [ BO, #24 ]
+	fmacd	d22  , d6,  d14
+	fmacd	d30  , d7,  d15
+	add	BO , BO, #32
+	fmacd	d23  , d6,  d15
+	add	AO , AO, #32
+	fmacd	d31  , d7,  d14
+
+.endm
+
+
+.macro KERNEL2x2_E
+
+	fmacd	d16  , d4,  d12
+	fmacd	d24  , d5,  d13
+	fmacd	d17  , d4,  d13
+	fmacd	d25  , d5,  d12
+
+	fmacd	d18  , d6,  d12
+	fmacd	d26  , d7,  d13
+	fmacd	d19  , d6,  d13
+	fmacd	d27  , d7,  d12
+
+	fmacd	d20  , d4,  d14
+	fmacd	d28  , d5,  d15
+	fmacd	d21  , d4,  d15
+	fmacd	d29  , d5,  d14
+
+	fmacd	d22  , d6,  d14
+	fmacd	d30  , d7,  d15
+	fmacd	d23  , d6,  d15
+	fmacd	d31  , d7,  d14
+
+.endm
+
+.macro KERNEL2x2_SUB
+
+	pld	[ AO , #A_PRE ]
+	pld	[ BO , #B_PRE ]
+	fldd	d0 , [ AO ]
+	fldd	d1 , [ AO, #8 ]
+	fldd	d8 , [ BO ]
+	fldd	d9 , [ BO, #8 ]
+
+	fmacd	d16  , d0,  d8
+	fldd	d2 , [ AO, #16 ]
+	fmacd	d24  , d1,  d9
+	fldd	d3 , [ AO, #24 ]
+	fmacd	d17  , d0,  d9
+	fldd	d10, [ BO, #16 ]
+	fmacd	d25  , d1,  d8
+
+	fldd	d11, [ BO, #24 ]
+	fmacd	d18  , d2,  d8
+	fmacd	d26  , d3,  d9
+	fmacd	d19  , d2,  d9
+	fmacd	d27  , d3,  d8
+
+	fmacd	d20  , d0,  d10
+	fmacd	d28  , d1,  d11
+	fmacd	d21  , d0,  d11
+	fmacd	d29  , d1,  d10
+
+	fmacd	d22  , d2,  d10
+	add	BO , BO, #32
+	fmacd	d30  , d3,  d11
+	fmacd	d23  , d2,  d11
+	add	AO , AO, #32
+	fmacd	d31  , d3,  d10
+
+.endm
+
+
+
+
+.macro SAVE2x2
+
+        ldr     r3, OLD_RSC                             // Row stride size
+        lsl     r3, r3, #4                              // multiply with size of complex double
+
+        fldd    d0, [ PTR_ALPHA ]                       // load real part of alpha
+        fldd    d1, [ PTR_ALPHA, #8 ]                   // load imag part of alpha
+        ldr     r4, PTR_BETA
+        fldd    d2, [ r4 ]                              // load real part of beta
+        fldd    d3, [ r4, #8  ]                         // load imag part of beta
+
+	// Add/Sub the real and the imag parts
+	FADD_R	d16, d24 , d16
+	FADD_I  d17, d25 , d17
+	FADD_R	d18, d26 , d18
+	FADD_I  d19, d27 , d19
+	FADD_R	d20, d28 , d20
+	FADD_I  d21, d29 , d21
+	FADD_R	d22, d30 , d22
+	FADD_I  d23, d31 , d23
+
+	mov	r4, CO1					// save pointer
+	fldmiad CO1, { d4 - d5 }			// read real and imag part from C
+	add	CO1, CO1, r3
+	
+	mov	r2, CO2					// save pointer
+	fldmiad CO2, { d8 - d9 }			// read real and imag part from C
+	add	CO2, CO2, r3
+
+	fmuld	d24, d4, d2				// multiply Beta-real with C-real
+	fmuld	d25, d5, d2				// multiply Beta-real with C-imag
+	fmuld	d28, d8, d2				// multiply Beta-real with C-real
+	fmuld	d29, d9, d2				// multiply Beta-real with C-imag
+
+	FMAC_BR	d24, d3, d5				// multiply beta-imag with C-imag and add
+	FMAC_BI	d25, d3, d4				// multiply beta-imag with C-real and add
+	FMAC_BR	d28, d3, d9				// multiply beta-imag with C-imag and add
+	FMAC_BI	d29, d3, d8				// multiply beta-imag with C-real and add
+
+	FMAC_R1 d24 , d0 , d16
+	FMAC_I1 d25 , d0 , d17
+	FMAC_R2 d24 , d1 , d17
+	FMAC_I2	d25 , d1 , d16
+
+	FMAC_R1 d28 , d0 , d20
+	FMAC_I1 d29 , d0 , d21
+	FMAC_R2 d28 , d1 , d21
+	FMAC_I2	d29 , d1 , d20
+
+	fldmiad CO1, { d4 - d5 }			// read real and imag part from C
+	fldmiad CO2, { d8 - d9 }			// read real and imag part from C
+
+	fmuld	d26, d4, d2				// multiply Beta-real with C-real
+	fmuld	d27, d5, d2				// multiply Beta-real with C-imag
+	fmuld	d30, d8, d2				// multiply Beta-real with C-real
+	fmuld	d31, d9, d2				// multiply Beta-real with C-imag
+
+	FMAC_BR	d26, d3, d5				// multiply beta-imag with C-imag and add
+	FMAC_BI	d27, d3, d4				// multiply beta-imag with C-real and add
+	FMAC_BR	d30, d3, d9				// multiply beta-imag with C-imag and add
+	FMAC_BI	d31, d3, d8				// multiply beta-imag with C-real and add
+
+	FMAC_R1 d26 , d0 , d18
+	FMAC_I1 d27 , d0 , d19
+	FMAC_R2 d26 , d1 , d19
+	FMAC_I2	d27 , d1 , d18
+
+	FMAC_R1 d30, d0 , d22
+	FMAC_I1 d31, d0 , d23
+	FMAC_R2 d30, d1 , d23
+	FMAC_I2	d31, d1 , d22
+
+	mov	CO1, r4					// restore pointer
+	mov	CO2, r2					// restore pointer
+	fstmiad CO1, { d24 - d25 }
+	fstmiad CO2, { d28 - d29 }
+	add	CO1, CO1, r3
+	add	CO2, CO2, r3
+	fstmiad CO1, { d26 - d27 }
+	fstmiad CO2, { d30 - d31 }
+
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+        .arm             	
+        .global REALNAME 	
+        .func   REALNAME 	
+
+REALNAME:
+
+	push	{r4 - r9, fp}					// save register
+	add	fp, sp, #28					// add number of saved register multiplied by size of int
+	sub	sp, sp, #STACKSIZE				// reserve stack
+
+	mov	AO, OLD_A					// pointer matrix A
+	mov	BO, OLD_B					// pointer matrix B
+
+	sub	r3, fp, #128
+	vstm	r3, { d8 - d15} 				// store floating point registers
+
+	ldr	r2, OLD_C					// pointer matrix C
+	ldr	r3, OLD_CSC					// Col stride size of C
+	lsl	r3, r3, #4					// multiply with size of complex double
+
+	mov	CO1, r2						// first line of C
+	add	CO2, CO1, r3					// second line of C
+
+	pld	[ CO1, #C_PRE ]					// prefetch the lines of C
+	pld	[ CO2, #C_PRE ]					// prefetch the lines of C
+
+zgemm_kernel_L2_M2_20:
+
+	asrs	L , K, #3					// L = K / 8
+	cmp	L , #2
+	blt	zgemm_kernel_L2_M2_32
+
+	KERNEL2x2_I
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	subs	L, L, #2
+	ble	zgemm_kernel_L2_M2_22a
+	.align 5
+
+zgemm_kernel_L2_M2_22:
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	subs	L, L, #1
+	bgt	zgemm_kernel_L2_M2_22
+
+zgemm_kernel_L2_M2_22a:
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_E
+
+	b	 zgemm_kernel_L2_M2_44
+
+zgemm_kernel_L2_M2_32:
+
+	tst	L, #1
+	ble	zgemm_kernel_L2_M2_40
+
+	KERNEL2x2_I
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+
+	KERNEL2x2_M1
+	KERNEL2x2_M2
+	KERNEL2x2_M1
+	KERNEL2x2_E
+
+	b	 zgemm_kernel_L2_M2_44
+
+zgemm_kernel_L2_M2_40:
+
+	INIT2x2
+
+zgemm_kernel_L2_M2_44:
+
+	ands	L , K, #7					// L = K % 8
+	ble	zgemm_kernel_L2_M2_100
+
+zgemm_kernel_L2_M2_46:
+
+	KERNEL2x2_SUB
+
+	subs	L, L, #1
+	bne	zgemm_kernel_L2_M2_46
+	
+zgemm_kernel_L2_M2_100:
+
+	SAVE2x2
+
+zgemm_kernel_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { d8 - d15}					// restore floating point registers
+
+	sub	sp, fp, #28
+	pop	{r4 - r9, fp}
+	bx	lr
+
--- a/config/armv7a/make_defs.mk
+++ b/config/armv7a/make_defs.mk
@@ -0,0 +1,108 @@
+#!/bin/bash
+#
+#  BLIS    
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+#
+#  Copyright (C) 2014, The University of Texas
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are
+#  met:
+#   - Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   - Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   - Neither the name of The University of Texas nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+# Only include this block of code once.
+ifndef MAKE_DEFS_MK_INCLUDED
+MAKE_DEFS_MK_INCLUDED := yes
+
+
+
+#
+# --- Build definitions --------------------------------------------------------
+#
+
+# Variables corresponding to other configure-time options.
+BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
+BLIS_ENABLE_STATIC_BUILD        := yes
+BLIS_ENABLE_DYNAMIC_BUILD       := no
+
+
+
+#
+# --- Utility program definitions ----------------------------------------------
+#
+
+SH         := /bin/sh
+MV         := mv
+MKDIR      := mkdir -p
+RM_F       := rm -f
+RM_RF      := rm -rf
+SYMLINK    := ln -sf
+FIND       := find
+GREP       := grep
+XARGS      := xargs
+RANLIB     := ranlib
+INSTALL    := install -c
+
+# Used to refresh CHANGELOG.
+GIT        := git
+GIT_LOG    := $(GIT) log --decorate
+
+
+
+#
+# --- Development tools definitions --------------------------------------------
+#
+
+# --- Determine the C compiler and related flags ---
+CC             := gcc
+# Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
+# NOTE: This is needed to enable posix_memalign().
+CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
+CMISCFLAGS     := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
+CDBGFLAGS      := #-g
+CWARNFLAGS     := -Wall
+COPTFLAGS      := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
+CKOPTFLAGS     := $(COPTFLAGS)
+CVECFLAGS      := #-msse3  # -mfpmath=sse
+
+# Aggregate all of the flags into multiple groups: one for standard
+# compilation, and one for each of the supported "special" compilation
+# modes.
+CFLAGS         := $(CDBGFLAGS) $(COPTFLAGS)  $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS_NOOPT   := $(CDBGFLAGS)                            $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+
+# --- Determine the archiver and related flags ---
+AR             := ar
+ARFLAGS        := cru
+
+# --- Determine the linker and related flags ---
+LINKER         := $(CC)
+LDFLAGS        := -lm
+
+
+
+# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
+endif