From 627d59b5ba06866b26f46e4434a0435b600925e3 Mon Sep 17 00:00:00 2001
From: Etienne Sauvage <etienne.sauvage@gmail.com>
Date: Mon, 29 Feb 2016 21:53:12 +0100
Subject: [PATCH 01/10] symbolic link for bulldozer configuration to kernels

---
 config/bulldozer/kernels | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/bulldozer/kernels b/config/bulldozer/kernels
index 71a8d8d34..9ed7ea19a 120000
--- a/config/bulldozer/kernels
+++ b/config/bulldozer/kernels
@@ -1 +1 @@
-../../kernels/x86_64/bulldozer
\ No newline at end of file
+../../kernels/x86_64/bulldozer/
\ No newline at end of file

From 4ca5d5b1fd6f2e4a8b2e139c5405475239581e51 Mon Sep 17 00:00:00 2001
From: Etienne Sauvage <etienne.sauvage@gmail.com>
Date: Tue, 1 Mar 2016 21:33:01 +0100
Subject: [PATCH 02/10] sgemm micro-kernel for FMA4 instruction set (bulldozer
 configuration), based on x86_64/avx micro-kernel

---
 config/bulldozer/bli_kernel.h                 |   9 +-
 .../x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c    | 927 ++++++++++++++++++
 2 files changed, 932 insertions(+), 4 deletions(-)

diff --git a/config/bulldozer/bli_kernel.h b/config/bulldozer/bli_kernel.h
index 388c6a1b4..c2b1e313a 100644
--- a/config/bulldozer/bli_kernel.h
+++ b/config/bulldozer/bli_kernel.h
@@ -51,9 +51,9 @@
 //     (b) MR (for zero-padding purposes when MR and NR are "swapped")
 //
 
-#define BLIS_DEFAULT_MC_S              256
-#define BLIS_DEFAULT_KC_S              256
-#define BLIS_DEFAULT_NC_S              8192
+#define BLIS_DEFAULT_MC_S              128
+#define BLIS_DEFAULT_KC_S              384
+#define BLIS_DEFAULT_NC_S              4096
 
 #define BLIS_DEFAULT_MC_D              1080
 #define BLIS_DEFAULT_KC_D              120
@@ -70,7 +70,7 @@
 // -- Register blocksizes --
 
 #define BLIS_DEFAULT_MR_S              8
-#define BLIS_DEFAULT_NR_S              4
+#define BLIS_DEFAULT_NR_S              8
 
 #define BLIS_DEFAULT_MR_D              4
 #define BLIS_DEFAULT_NR_D              6
@@ -149,6 +149,7 @@
 
 // -- gemm --
 
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_8x8_FMA4
 #define BLIS_DGEMM_UKERNEL         bli_dgemm_4x6_FMA4
 
 // -- trsm-related --
diff --git a/kernels/x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c b/kernels/x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c
index c140d7ced..8fc716a9e 100644
--- a/kernels/x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c
+++ b/kernels/x86_64/bulldozer/3/bli_gemm_4x6_FMA4.c
@@ -34,6 +34,933 @@
 
 #include "blis.h"
 
+void bli_sgemm_8x8_FMA4(
+                        dim_t              k,
+                        float* restrict    alpha,
+                        float* restrict    a,
+                        float* restrict    b,
+                        float* restrict    beta,
+                        float* restrict    c, inc_t rs_c, inc_t cs_c,
+                        auxinfo_t*         data
+                      )
+{
+	dim_t   k_iter = k / 4;
+	dim_t   k_left = k % 4;
+
+	__asm__ volatile
+	(
+	"                                           \n\t"
+	"movq                %2, %%rax              \n\t" // load address of a.
+	"movq                %3, %%rbx              \n\t" // load address of b.
+	"                                           \n\t"
+	"vmovaps    0 * 32(%%rax), %%ymm0           \n\t" // initialize loop by pre-loading
+	"vmovsldup  0 * 32(%%rbx), %%ymm2			\n\t" // elements of a and b.
+	"vpermilps   $0x4e, %%ymm2, %%ymm3          \n\t"
+	"                                           \n\t"
+	"movq                %6, %%rcx              \n\t" // load address of c
+	"movq                %8, %%rdi              \n\t" // load cs_c
+	"leaq        (,%%rdi,4), %%rdi              \n\t" // cs_c *= sizeof(float)
+	"leaq   (%%rcx,%%rdi,4), %%r10              \n\t" // load address of c + 4*cs_c;
+	"                                           \n\t"
+	"leaq   (%%rdi,%%rdi,2), %%r14              \n\t" // r14 = 3*cs_c;
+	"prefetcht0   7 * 8(%%rcx)                  \n\t" // prefetch c + 0*cs_c
+	"prefetcht0   7 * 8(%%rcx,%%rdi)            \n\t" // prefetch c + 1*cs_c
+	"prefetcht0   7 * 8(%%rcx,%%rdi,2)          \n\t" // prefetch c + 2*cs_c
+	"prefetcht0   7 * 8(%%rcx,%%r14)            \n\t" // prefetch c + 3*cs_c
+	"prefetcht0   7 * 8(%%r10)                  \n\t" // prefetch c + 4*cs_c
+	"prefetcht0   7 * 8(%%r10,%%rdi)            \n\t" // prefetch c + 5*cs_c
+	"prefetcht0   7 * 8(%%r10,%%rdi,2)          \n\t" // prefetch c + 6*cs_c
+	"prefetcht0   7 * 8(%%r10,%%r14)            \n\t" // prefetch c + 7*cs_c
+	"                                           \n\t"
+	"vxorps    %%ymm8,  %%ymm8,  %%ymm8         \n\t"
+	"vxorps    %%ymm9,  %%ymm9,  %%ymm9         \n\t"
+	"vxorps    %%ymm10, %%ymm10, %%ymm10        \n\t"
+	"vxorps    %%ymm11, %%ymm11, %%ymm11        \n\t"
+	"vxorps    %%ymm12, %%ymm12, %%ymm12        \n\t"
+	"vxorps    %%ymm13, %%ymm13, %%ymm13        \n\t"
+	"vxorps    %%ymm14, %%ymm14, %%ymm14        \n\t"
+	"vxorps    %%ymm15, %%ymm15, %%ymm15        \n\t"
+	"                                           \n\t"
+	"                                           \n\t"
+	"movq      %0, %%rsi                        \n\t" // i = k_iter;
+	"testq  %%rsi, %%rsi                        \n\t" // check i via logical AND.
+	"je     .SCONSIDKLEFT                       \n\t" // if i == 0, jump to code that
+	"                                           \n\t" // contains the k_left loop.
+	"                                            \n\t"
+	".SLOOPKITER:                                \n\t" // MAIN LOOP
+	"                                            \n\t"
+	"                                            \n\t" // iteration 0
+	"prefetcht0  16 * 32(%%rax)                  \n\t"
+	"vfmaddps	%%ymm15, %%ymm0,  %%ymm2, %%ymm15\n\t"
+	"vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
+	"vmovshdup  0 * 32(%%rbx),  %%ymm2          \n\t"
+	"vfmaddps	%%ymm13, %%ymm0,  %%ymm3, %%ymm13\n\t"
+	"vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
+	"                                            \n\t"
+	"vmovaps    1 * 32(%%rax),  %%ymm1           \n\t"
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
+	"vfmaddps	%%ymm11, %%ymm0,  %%ymm4, %%ymm11\n\t"
+	"vfmaddps	%%ymm9, %%ymm0,  %%ymm5, %%ymm9	\n\t"
+	"                                            \n\t"
+	"vfmaddps	%%ymm14, %%ymm0,  %%ymm2, %%ymm14\n\t"
+	"vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
+	"vmovsldup  1 * 32(%%rbx),  %%ymm2           \n\t"
+	"vfmaddps	%%ymm12, %%ymm0,  %%ymm3, %%ymm12\n\t"
+	"vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
+	"                                            \n\t"
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
+	"vfmaddps	%%ymm10, %%ymm0,  %%ymm4, %%ymm10\n\t"
+	"vfmaddps	%%ymm8, %%ymm0,  %%ymm5, %%ymm8\n\t"
+	"                                            \n\t"
+	"                                            \n\t" // iteration 1
+	"vfmaddps	%%ymm15, %%ymm1,  %%ymm2, %%ymm15\n\t"
+	"vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
+	"vmovshdup  1 * 32(%%rbx), %%ymm2            \n\t"
+	"vfmaddps	%%ymm13, %%ymm1,  %%ymm3, %%ymm13\n\t"
+	"vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
+	"                                            \n\t"
+	"vmovaps    2 * 32(%%rax),  %%ymm0           \n\t"
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
+	"vfmaddps	%%ymm11, %%ymm1,  %%ymm4, %%ymm11\n\t"
+	"vfmaddps	%%ymm9, %%ymm1,  %%ymm5, %%ymm9\n\t"
+	"                                            \n\t"
+	"vfmaddps	%%ymm14, %%ymm1,  %%ymm2, %%ymm14\n\t"
+	"vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
+	"vmovsldup  2 * 32(%%rbx),  %%ymm2           \n\t"
+	"vfmaddps	%%ymm12, %%ymm1,  %%ymm3, %%ymm12\n\t"
+	"vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
+	"                                            \n\t"
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
+	"vfmaddps	%%ymm10, %%ymm1,  %%ymm4, %%ymm10\n\t"
+	"vfmaddps	%%ymm8, %%ymm1,  %%ymm5, %%ymm8\n\t"
+	"                                            \n\t"
+	"                                            \n\t" // iteration 2
+	"prefetcht0  18 * 32(%%rax)                  \n\t"
+	"vfmaddps	%%ymm15, %%ymm0,  %%ymm2, %%ymm15\n\t"
+	"vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
+	"vmovshdup  2 * 32(%%rbx),  %%ymm2           \n\t"
+	"vfmaddps	%%ymm13, %%ymm0,  %%ymm3, %%ymm13\n\t"
+	"vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
+	"                                            \n\t"
+	"vmovaps    3 * 32(%%rax),  %%ymm1           \n\t"
+	"addq           $4 * 8 * 4, %%rax            \n\t" // a += 4*8 (unroll x mr)
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
+	"vfmaddps	%%ymm11, %%ymm0,  %%ymm4, %%ymm11\n\t"
+	"vfmaddps	%%ymm9, %%ymm0,  %%ymm5, %%ymm9\n\t"
+	"                                            \n\t"
+	"vfmaddps	%%ymm14, %%ymm0,  %%ymm2, %%ymm14\n\t"
+	"vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
+	"vmovsldup  3 * 32(%%rbx),  %%ymm2           \n\t"
+	"vfmaddps	%%ymm12, %%ymm0,  %%ymm3, %%ymm12\n\t"
+	"vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
+	"                                            \n\t"
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
+	"vfmaddps	%%ymm10, %%ymm0,  %%ymm4, %%ymm10\n\t"
+	"vfmaddps	%%ymm8, %%ymm0,  %%ymm5, %%ymm8\n\t"
+	"                                            \n\t"
+	"                                            \n\t" // iteration 3
+	"vfmaddps	%%ymm15, %%ymm1,  %%ymm2, %%ymm15\n\t"
+	"vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
+	"vmovshdup  3 * 32(%%rbx), %%ymm2            \n\t"
+	"addq           $4 * 8 * 4, %%rbx            \n\t" // b += 4*8 (unroll x nr)
+	"vfmaddps	%%ymm13, %%ymm1,  %%ymm3, %%ymm13\n\t"
+	"vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
+	"                                            \n\t"
+	"vmovaps    0 * 32(%%rax),  %%ymm0           \n\t"
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
+	"vfmaddps	%%ymm11, %%ymm1,  %%ymm4, %%ymm11\n\t"
+	"vfmaddps	%%ymm9, %%ymm1,  %%ymm5, %%ymm9\n\t"
+	"                                            \n\t"
+	"vfmaddps	%%ymm14, %%ymm1,  %%ymm2, %%ymm14\n\t"
+	"vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
+	"vmovsldup  0 * 32(%%rbx),  %%ymm2           \n\t"
+	"vfmaddps	%%ymm12, %%ymm1,  %%ymm3, %%ymm12\n\t"
+	"vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
+	"                                            \n\t"
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
+	"vfmaddps	%%ymm10, %%ymm1,  %%ymm4, %%ymm10\n\t"
+	"vfmaddps	%%ymm8, %%ymm1,  %%ymm5, %%ymm8\n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"decq   %%rsi                                \n\t" // i -= 1;
+	"jne    .SLOOPKITER                          \n\t" // iterate again if i != 0.
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	".SCONSIDKLEFT:                              \n\t"
+	"                                            \n\t"
+	"movq      %1, %%rsi                         \n\t" // i = k_left;
+	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
+	"je     .SPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
+	"                                            \n\t" // else, we prepare to enter k_left loop.
+	"                                            \n\t"
+	".SLOOPKLEFT:                                \n\t" // EDGE LOOP: one k step per pass. On entry ymm0 = preloaded column of a, ymm2 = even-duplicated row of b, ymm3 = ymm2 with 64-bit halves swapped per lane.
+	"                                            \n\t"
+	"prefetcht0  16 * 32(%%rax)                  \n\t"
+	"vfmaddps	%%ymm15, %%ymm0,  %%ymm2, %%ymm15\n\t" // ymm15 += ymm0 * ymm2
+	"vperm2f128  $0x3, %%ymm2,  %%ymm2, %%ymm4   \n\t" // ymm4 = ymm2 with its 128-bit lanes swapped
+	"vmovshdup  0 * 32(%%rbx),  %%ymm2           \n\t" // ymm2 = odd elements of the current b row, duplicated
+	"vfmaddps	%%ymm13, %%ymm0,  %%ymm3, %%ymm13\n\t" // ymm13 += ymm0 * ymm3
+	"vperm2f128  $0x3, %%ymm3,  %%ymm3, %%ymm5   \n\t" // ymm5 = ymm3 with its 128-bit lanes swapped
+	"                                            \n\t"
+	"vmovaps    1 * 32(%%rax),  %%ymm1           \n\t" // preload the next column of a
+	"addq           $8 * 1 * 4, %%rax            \n\t" // a += 8 (1 x mr)
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t" // ymm3 = odd-duplicated row with 64-bit halves swapped per lane
+	"vfmaddps	%%ymm11, %%ymm0,  %%ymm4, %%ymm11\n\t" // ymm11 += ymm0 * ymm4
+	"vfmaddps	%%ymm9, %%ymm0,  %%ymm5, %%ymm9\n\t" // ymm9 += ymm0 * ymm5
+	"                                            \n\t"
+	"vfmaddps	%%ymm14, %%ymm0,  %%ymm2, %%ymm14\n\t" // ymm14 += ymm0 * ymm2 (odd-duplicated row)
+	"vperm2f128  $0x3, %%ymm2,  %%ymm2, %%ymm4   \n\t" // ymm4 = lane-swapped odd-duplicated row
+	"vmovsldup  1 * 32(%%rbx),  %%ymm2           \n\t" // preload even elements of the next b row, duplicated
+	"addq           $8 * 1 * 4, %%rbx            \n\t" // b += 8 (1 x nr)
+	"vfmaddps	%%ymm12, %%ymm0,  %%ymm3, %%ymm12\n\t" // ymm12 += ymm0 * ymm3
+	"vperm2f128  $0x3, %%ymm3,  %%ymm3, %%ymm5   \n\t" // ymm5 = lane-swapped ymm3; must be refreshed here because the ymm8 update below reads it (same sequence as the main loop)
+	"                                            \n\t"
+	"vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t" // ymm3 for the next pass, derived from the preloaded row
+	"vfmaddps	%%ymm10, %%ymm0,  %%ymm4, %%ymm10\n\t" // ymm10 += ymm0 * ymm4
+	"vfmaddps	%%ymm8, %%ymm0,  %%ymm5, %%ymm8\n\t" // ymm8 += ymm0 * ymm5
+	"vmovaps           %%ymm1,  %%ymm0           \n\t" // rotate the preloaded a column into ymm0 for the next pass
+	"                                            \n\t"
+	"                                            \n\t"
+	"decq   %%rsi                                \n\t" // i -= 1;
+	"jne    .SLOOPKLEFT                          \n\t" // iterate again if i != 0.
+	"                                            \n\t"
+	"                                            \n\t"
+	".SPOSTACCUM:                                \n\t"
+	"                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
+	"                                            \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
+	"                                            \n\t" //   ab10    ab12    ab14    ab16  
+	"                                            \n\t" //   ab22    ab20    ab26    ab24
+	"                                            \n\t" //   ab32    ab30    ab36    ab34
+	"                                            \n\t" //   ab44    ab46    ab40    ab42
+	"                                            \n\t" //   ab54    ab56    ab50    ab52  
+	"                                            \n\t" //   ab66    ab64    ab62    ab60
+	"                                            \n\t" //   ab76 )  ab74 )  ab72 )  ab70 )
+	"                                            \n\t"
+	"                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
+	"                                            \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
+	"                                            \n\t" //   ab11    ab13    ab15    ab17  
+	"                                            \n\t" //   ab23    ab21    ab27    ab25
+	"                                            \n\t" //   ab33    ab31    ab37    ab35
+	"                                            \n\t" //   ab45    ab47    ab41    ab43
+	"                                            \n\t" //   ab55    ab57    ab51    ab53  
+	"                                            \n\t" //   ab67    ab65    ab63    ab61
+	"                                            \n\t" //   ab77 )  ab75 )  ab73 )  ab71 )
+	"vmovaps          %%ymm15, %%ymm7            \n\t"
+	"vshufps   $0xe4, %%ymm13, %%ymm15, %%ymm15  \n\t"
+	"vshufps   $0xe4, %%ymm7,  %%ymm13, %%ymm13  \n\t"
+	"                                            \n\t"
+	"vmovaps          %%ymm11, %%ymm7            \n\t"
+	"vshufps   $0xe4, %%ymm9,  %%ymm11, %%ymm11  \n\t"
+	"vshufps   $0xe4, %%ymm7,  %%ymm9,  %%ymm9   \n\t"
+	"                                            \n\t"
+	"vmovaps          %%ymm14, %%ymm7            \n\t"
+	"vshufps   $0xe4, %%ymm12, %%ymm14, %%ymm14  \n\t"
+	"vshufps   $0xe4, %%ymm7,  %%ymm12, %%ymm12  \n\t"
+	"                                            \n\t"
+	"vmovaps          %%ymm10, %%ymm7            \n\t"
+	"vshufps   $0xe4, %%ymm8,  %%ymm10, %%ymm10  \n\t"
+	"vshufps   $0xe4, %%ymm7,  %%ymm8,  %%ymm8   \n\t"
+	"                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
+	"                                            \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
+	"                                            \n\t" //   ab10    ab12    ab14    ab16  
+	"                                            \n\t" //   ab20    ab22    ab24    ab26
+	"                                            \n\t" //   ab30    ab32    ab34    ab36
+	"                                            \n\t" //   ab44    ab46    ab40    ab42
+	"                                            \n\t" //   ab54    ab56    ab50    ab52  
+	"                                            \n\t" //   ab64    ab66    ab60    ab62
+	"                                            \n\t" //   ab74 )  ab76 )  ab70 )  ab72 )
+	"                                            \n\t"
+	"                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
+	"                                            \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
+	"                                            \n\t" //   ab11    ab13    ab15    ab17  
+	"                                            \n\t" //   ab21    ab23    ab25    ab27
+	"                                            \n\t" //   ab31    ab33    ab35    ab37
+	"                                            \n\t" //   ab45    ab47    ab41    ab43
+	"                                            \n\t" //   ab55    ab57    ab51    ab53  
+	"                                            \n\t" //   ab65    ab67    ab61    ab63
+	"                                            \n\t" //   ab75 )  ab77 )  ab71 )  ab73 )
+	"vmovaps           %%ymm15, %%ymm7           \n\t"
+	"vperm2f128 $0x30, %%ymm11, %%ymm15, %%ymm15 \n\t"
+	"vperm2f128 $0x12, %%ymm11, %%ymm7,  %%ymm11 \n\t"
+	"                                            \n\t"
+	"vmovaps           %%ymm13, %%ymm7           \n\t"
+	"vperm2f128 $0x30, %%ymm9,  %%ymm13, %%ymm13 \n\t"
+	"vperm2f128 $0x12, %%ymm9,  %%ymm7,  %%ymm9  \n\t"
+	"                                            \n\t"
+	"vmovaps           %%ymm14, %%ymm7           \n\t"
+	"vperm2f128 $0x30, %%ymm10, %%ymm14, %%ymm14 \n\t"
+	"vperm2f128 $0x12, %%ymm10, %%ymm7,  %%ymm10 \n\t"
+	"                                            \n\t"
+	"vmovaps           %%ymm12, %%ymm7           \n\t"
+	"vperm2f128 $0x30, %%ymm8,  %%ymm12, %%ymm12 \n\t"
+	"vperm2f128 $0x12, %%ymm8,  %%ymm7,  %%ymm8  \n\t"
+	"                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
+	"                                            \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
+	"                                            \n\t" //   ab10    ab12    ab14    ab16  
+	"                                            \n\t" //   ab20    ab22    ab24    ab26
+	"                                            \n\t" //   ab30    ab32    ab34    ab36
+	"                                            \n\t" //   ab40    ab42    ab44    ab46
+	"                                            \n\t" //   ab50    ab52    ab54    ab56  
+	"                                            \n\t" //   ab60    ab62    ab64    ab66
+	"                                            \n\t" //   ab70 )  ab72 )  ab74 )  ab76 )
+	"                                            \n\t"
+	"                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
+	"                                            \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
+	"                                            \n\t" //   ab11    ab13    ab15    ab17  
+	"                                            \n\t" //   ab21    ab23    ab25    ab27
+	"                                            \n\t" //   ab31    ab33    ab35    ab37
+	"                                            \n\t" //   ab41    ab43    ab45    ab47
+	"                                            \n\t" //   ab51    ab53    ab55    ab57  
+	"                                            \n\t" //   ab61    ab63    ab65    ab67
+	"                                            \n\t" //   ab71 )  ab73 )  ab75 )  ab77 )
+	"                                            \n\t"
+	"movq         %4, %%rax                      \n\t" // load address of alpha
+	"movq         %5, %%rbx                      \n\t" // load address of beta 
+	"vbroadcastss    (%%rax), %%ymm0             \n\t" // load alpha and duplicate
+	"vbroadcastss    (%%rbx), %%ymm4             \n\t" // load beta and duplicate
+	"                                            \n\t"
+	"vmulps           %%ymm0,  %%ymm8,  %%ymm8   \n\t" // scale by alpha
+	"vmulps           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
+	"vmulps           %%ymm0,  %%ymm10, %%ymm10  \n\t"
+	"vmulps           %%ymm0,  %%ymm11, %%ymm11  \n\t"
+	"vmulps           %%ymm0,  %%ymm12, %%ymm12  \n\t"
+	"vmulps           %%ymm0,  %%ymm13, %%ymm13  \n\t"
+	"vmulps           %%ymm0,  %%ymm14, %%ymm14  \n\t"
+	"vmulps           %%ymm0,  %%ymm15, %%ymm15  \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"movq                %7, %%rsi               \n\t" // load rs_c
+	"leaq        (,%%rsi,4), %%rsi               \n\t" // rsi = rs_c * sizeof(float)
+	"                                            \n\t"
+	"leaq   (%%rcx,%%rsi,4), %%rdx               \n\t" // load address of c + 4*rs_c;
+	"                                            \n\t"
+	"leaq        (,%%rsi,2), %%r12               \n\t" // r12 = 2*rs_c;
+	"leaq   (%%r12,%%rsi,1), %%r13               \n\t" // r13 = 3*rs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // Choose the store path for C: determine if
+	"                                            \n\t" //    c    % 32 == 0, AND
+	"                                            \n\t" //  4*cs_c % 32 == 0, AND
+	"                                            \n\t" //    rs_c      == 1
+	"                                            \n\t" // ie: aligned, ldim aligned, and
+	"                                            \n\t" // column-stored
+	"                                            \n\t"
+	"cmpq       $4, %%rsi                        \n\t" // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+	"sete           %%bl                         \n\t" // bl = ( ZF == 1 ? 1 : 0 );
+	"testq     $31, %%rcx                        \n\t" // set ZF if (c & 31) == 0, i.e. c is 32-byte aligned.
+	"setz           %%bh                         \n\t" // bh = ( ZF == 1 ? 1 : 0 );
+	"testq     $31, %%rdi                        \n\t" // set ZF if ((4*cs_c) & 31) == 0.
+	"setz           %%al                         \n\t" // al = ( ZF == 1 ? 1 : 0 );
+	"                                            \n\t" // and(bl,bh) followed by
+	"                                            \n\t" // and(bh,al) will reveal result
+	"                                            \n\t"
+	"                                            \n\t" // now avoid loading C if beta == 0
+	"                                            \n\t"
+	"vxorps    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
+	"vucomiss  %%xmm0,  %%xmm4                   \n\t" // set ZF if beta == 0 (ZF is also set on an unordered compare, so a NaN beta takes the same branch).
+	"je      .SBETAZERO                          \n\t" // if ZF = 1, jump to beta == 0 case
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // check if aligned/column-stored
+	"andb     %%bl, %%bh                         \n\t" // bh &= bl; ZF is set if the result is 0.
+	"andb     %%bh, %%al                         \n\t" // al &= bh; ZF is clear only if bl, bh and al were all 1.
+	"jne     .SCOLSTORED                         \n\t" // all three conditions hold: jump to column storage case
+	"                                            \n\t"
+	"                                            \n\t"
+	".SGENSTORED:                                \n\t"
+	"                                            \n\t" // update c00:c70
+	"vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
+	"vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
+	"vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
+	"vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
+	"vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
+	"vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
+	"vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
+	"vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
+	"vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
+	"                                            \n\t"
+	"vfmaddps	%%ymm15, %%ymm0,  %%ymm4, %%ymm0\n\t"	// scale by beta and add the gemm result,
+	"                                            \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t" // update c01:c71
+	"vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
+	"vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
+	"vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
+	"vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
+	"vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
+	"vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
+	"vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
+	"vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
+	"vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
+	"                                            \n\t"
+	"vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
+	"vaddps            %%ymm14, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
+	"                                            \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c02:c72
+	"vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
+	"vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
+	"vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
+	"vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
+	"vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
+	"vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
+	"vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
+	"vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
+	"vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
+	"                                            \n\t"
+	"vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
+	"vaddps            %%ymm13, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
+	"                                            \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c03:c73
+	"vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
+	"vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
+	"vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
+	"vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
+	"vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
+	"vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
+	"vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
+	"vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
+	"vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
+	"                                            \n\t"
+	"vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
+	"vaddps            %%ymm12, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
+	"                                            \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c04:c74
+	"vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
+	"vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
+	"vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
+	"vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
+	"vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
+	"vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
+	"vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
+	"vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
+	"vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
+	"                                            \n\t"
+	"vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
+	"vaddps            %%ymm11, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
+	"                                            \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c05:c75
+	"vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
+	"vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
+	"vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
+	"vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
+	"vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
+	"vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
+	"vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
+	"vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
+	"vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
+	"                                            \n\t"
+	"vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
+	"vaddps            %%ymm10, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
+	"                                            \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c06:c76
+	"vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
+	"vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
+	"vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
+	"vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
+	"vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
+	"vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
+	"vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
+	"vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
+	"vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
+	"                                            \n\t"
+	"vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
+	"vaddps            %%ymm9,  %%ymm0,  %%ymm0  \n\t" // add the gemm result,
+	"                                            \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c07:c77
+	"vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
+	"vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
+	"vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
+	"vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
+	"vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
+	"vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
+	"vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
+	"vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
+	"vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
+	"vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
+	"                                            \n\t"
+	"vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
+	"vaddps            %%ymm8,  %%ymm0,  %%ymm0  \n\t" // add the gemm result,
+	"                                            \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"jmp    .SDONE                               \n\t" // jump to end.
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	".SCOLSTORED:                                \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"vmovaps    (%%rcx),       %%ymm0            \n\t" // load c00:c70,
+	"vmulps           %%ymm4,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
+	"vaddps           %%ymm15, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
+	"vmovaps          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps    (%%rcx),       %%ymm1            \n\t" // load c01:c71,
+	"vmulps           %%ymm4,  %%ymm1,  %%ymm1   \n\t" // scale by beta,
+	"vaddps           %%ymm14, %%ymm1,  %%ymm1   \n\t" // add the gemm result,
+	"vmovaps          %%ymm1,  (%%rcx)           \n\t" // and store back to memory.
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps    (%%rcx),       %%ymm0            \n\t" // load c02:c72,
+	"vmulps           %%ymm4,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
+	"vaddps           %%ymm13, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
+	"vmovaps          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps    (%%rcx),       %%ymm1            \n\t" // load c03:c73,
+	"vmulps           %%ymm4,  %%ymm1,  %%ymm1   \n\t" // scale by beta,
+	"vaddps           %%ymm12, %%ymm1,  %%ymm1   \n\t" // add the gemm result,
+	"vmovaps          %%ymm1,  (%%rcx)           \n\t" // and store back to memory.
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps    (%%rcx),       %%ymm0            \n\t" // load c04:c74,
+	"vmulps           %%ymm4,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
+	"vaddps           %%ymm11, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
+	"vmovaps          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps    (%%rcx),       %%ymm1            \n\t" // load c05:c75,
+	"vmulps           %%ymm4,  %%ymm1,  %%ymm1   \n\t" // scale by beta,
+	"vaddps           %%ymm10, %%ymm1,  %%ymm1   \n\t" // add the gemm result,
+	"vmovaps          %%ymm1,  (%%rcx)           \n\t" // and store back to memory.
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps    (%%rcx),       %%ymm0            \n\t" // load c06:c76,
+	"vmulps           %%ymm4,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
+	"vaddps           %%ymm9,  %%ymm0,  %%ymm0   \n\t" // add the gemm result,
+	"vmovaps          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps    (%%rcx),       %%ymm1            \n\t" // load c07:c77,
+	"vmulps           %%ymm4,  %%ymm1,  %%ymm1   \n\t" // scale by beta,
+	"vaddps           %%ymm8,  %%ymm1,  %%ymm1   \n\t" // add the gemm result,
+	"vmovaps          %%ymm1,  (%%rcx)           \n\t" // and store back to memory.
+	"                                            \n\t"
+	"                                            \n\t"
+	"jmp    .SDONE                               \n\t" // jump to end.
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	".SBETAZERO:                                 \n\t" // label name suggests the beta == 0 case; both store paths below only write c, never read it.
+	"                                            \n\t" // choose the store variant (aligned/column-stored vs. general stride). bl, bh and al are set before this point (not visible here; presumably 0/1 flags -- TODO confirm).
+	"andb     %%bl, %%bh                         \n\t" // bh &= bl; ZF is set iff the result is 0.
+	"andb     %%bh, %%al                         \n\t" // al &= bh, i.e. al = al & bh & bl; ZF is set iff the result is 0.
+	"jne     .SCOLSTORBZ                         \n\t" // result != 0 (ZF clear): jump to column storage case; otherwise fall through to .SGENSTORBZ.
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	".SGENSTORBZ:                                \n\t" // general-stride store path: each 8-float result vector is written one element at a time.
+	"                                            \n\t" // rsi, r12, r13 (offsets) and rdx (second base pointer) are set up before this point; presumably 1x/2x/3x the row stride and c + 4 rows -- TODO confirm.
+	"                                            \n\t" // update c00:c70
+	"vmovapd           %%ymm15, %%ymm0           \n\t" // bitwise copy of ymm15 into scratch ymm0 (vmovapd is the double-typed mnemonic; the 256 bits moved are the same).
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t" // xmm2 = upper 128 bits of ymm0 (elements 4..7); xmm0 already holds elements 0..3.
+	"vmovss            %%xmm0, (%%rcx)           \n\t" // store element 0 at rcx.
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t" // imm 0x39 rotates the four floats down by one: the next element moves into lane 0.
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t" // store element 1 at rcx + rsi.
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t" // rotate again: element 2 into lane 0.
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t" // store element 2 at rcx + r12.
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t" // rotate again: element 3 into lane 0.
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t" // store element 3 at rcx + r13.
+	"vmovss            %%xmm2, (%%rdx)           \n\t" // store element 4 at rdx.
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t" // same rotate-and-store sequence for the upper half.
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t" // store element 5 at rdx + rsi.
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t" // rotate: element 6 into lane 0.
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t" // store element 6 at rdx + r12.
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t" // rotate: element 7 into lane 0.
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t" // store element 7 at rdx + r13.
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c; (advance the first base pointer to the next column)
+	"addq      %%rdi, %%rdx                      \n\t" // advance the second base pointer (rdx) by the same cs_c step.
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c01:c71
+	"vmovapd           %%ymm14, %%ymm0           \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c02:c72
+	"vmovapd           %%ymm13, %%ymm0           \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c03:c73
+	"vmovapd           %%ymm12, %%ymm0           \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c04:c74
+	"vmovapd           %%ymm11, %%ymm0           \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c05:c75
+	"vmovapd           %%ymm10, %%ymm0           \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c06:c76
+	"vmovapd           %%ymm9,  %%ymm0           \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t" // update c07:c77
+	"vmovapd           %%ymm8,  %%ymm0           \n\t"
+	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
+	"vmovss            %%xmm0, (%%rcx)           \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
+	"vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
+	"vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
+	"vmovss            %%xmm2, (%%rdx)           \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
+	"vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
+	"vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
+	"vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
+	"vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"jmp    .SDONE                               \n\t" // jump to end.
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	".SCOLSTORBZ:                                \n\t" // column-stored path without reading c: one full-vector store per column.
+	"                                            \n\t" // vmovaps with a memory operand requires a 32-byte-aligned address for ymm registers.
+	"                                            \n\t"
+	"vmovaps          %%ymm15, (%%rcx)           \n\t" // store c00:c70 (ymm15) directly; c is not loaded or scaled here.
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps          %%ymm14, (%%rcx)           \n\t" // store c01:c71 (ymm14).
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps          %%ymm13, (%%rcx)           \n\t" // store c02:c72 (ymm13).
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps          %%ymm12, (%%rcx)           \n\t" // store c03:c73 (ymm12).
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps          %%ymm11, (%%rcx)           \n\t" // store c04:c74 (ymm11).
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps          %%ymm10, (%%rcx)           \n\t" // store c05:c75 (ymm10).
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps          %%ymm9,  (%%rcx)           \n\t" // store c06:c76 (ymm9).
+	"addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
+	"                                            \n\t"
+	"vmovaps          %%ymm8,  (%%rcx)           \n\t" // store c07:c77 (ymm8); last column, so no pointer advance follows.
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	"                                            \n\t"
+	".SDONE:                                     \n\t"
+	"                                            \n\t"
+
+	: // output operands (none)
+	: // input operands
+	  "m" (k_iter), // 0
+	  "m" (k_left), // 1
+	  "m" (a),      // 2
+	  "m" (b),      // 3
+	  "m" (alpha),  // 4
+	  "m" (beta),   // 5
+	  "m" (c),      // 6
+	  "m" (rs_c),   // 7
+	  "m" (cs_c)/*,   // 8
+	  "m" (b_next), // 9
+	  "m" (a_next)*/  // 10
+	: // register clobber list
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+	  "xmm0", "xmm1", "xmm2", "xmm3",
+	  "xmm4", "xmm5", "xmm6", "xmm7",
+	  "xmm8", "xmm9", "xmm10", "xmm11",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	);
+}
+
 #undef KERNEL4x6_1
 #undef KERNEL4x6_2
 #undef KERNEL4x6_3

From af92773f4f85a2441fe0c6e3a52c31b07253d08e Mon Sep 17 00:00:00 2001
From: figual <figual@ucm.es>
Date: Wed, 23 Mar 2016 22:07:02 +0100
Subject: [PATCH 03/10] Updated and improved ARMv8 micro-kernels.

---
 config/armv8a/bli_kernel.h               |   26 +-
 kernels/armv8a/neon/3/bli_gemm_opt_4x4.c | 2109 +++++++++++++++++-----
 2 files changed, 1701 insertions(+), 434 deletions(-)

diff --git a/config/armv8a/bli_kernel.h b/config/armv8a/bli_kernel.h
index 3bd7da722..38eaef60d 100644
--- a/config/armv8a/bli_kernel.h
+++ b/config/armv8a/bli_kernel.h
@@ -51,13 +51,13 @@
 //     (b) MR (for zero-padding purposes when MR and NR are "swapped")
 //
 
-#define BLIS_DEFAULT_MC_S              336
-#define BLIS_DEFAULT_KC_S              336
-#define BLIS_DEFAULT_NC_S              4096
+#define BLIS_DEFAULT_MC_S              120 // other values left by the author (presumably earlier tuning candidates -- TODO confirm): 1536, 336, 416, 1280, 160, 160, 160, 2048, 336
+#define BLIS_DEFAULT_KC_S              640 // other values left by the author (presumably earlier tuning candidates -- TODO confirm): 1536, 336, 704, 1280, 672, 528, 856, 2048, 528
+#define BLIS_DEFAULT_NC_S              3072
 
-#define BLIS_DEFAULT_MC_D              160
-#define BLIS_DEFAULT_KC_D              304
-#define BLIS_DEFAULT_NC_D              4096
+#define BLIS_DEFAULT_MC_D              120 // other values left by the author (presumably earlier tuning candidates -- TODO confirm): 1536, 160, 80, 176
+#define BLIS_DEFAULT_KC_D              240 // other values left by the author (presumably earlier tuning candidates -- TODO confirm): 1536, 304, 336, 368
+#define BLIS_DEFAULT_NC_D              3072
 
 #define BLIS_DEFAULT_MC_C              64
 #define BLIS_DEFAULT_KC_C              128
@@ -69,11 +69,11 @@
 
 // -- Register blocksizes --
 
-#define BLIS_DEFAULT_MR_S              4
-#define BLIS_DEFAULT_NR_S              4
+#define BLIS_DEFAULT_MR_S              8
+#define BLIS_DEFAULT_NR_S              12
 
-#define BLIS_DEFAULT_MR_D              4
-#define BLIS_DEFAULT_NR_D              4
+#define BLIS_DEFAULT_MR_D              6
+#define BLIS_DEFAULT_NR_D              8
 
 #define BLIS_DEFAULT_MR_C              8
 #define BLIS_DEFAULT_NR_C              4
@@ -132,6 +132,8 @@
 //#define BLIS_PACKDIM_MR_Z              (BLIS_DEFAULT_MR_Z + ...)
 //#define BLIS_PACKDIM_NR_Z              (BLIS_DEFAULT_NR_Z + ...)
 
+
+
 // -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
 
 
@@ -146,8 +148,8 @@
 
 // -- gemm --
 
-#define BLIS_SGEMM_UKERNEL         bli_sgemm_opt_4x4
-#define BLIS_DGEMM_UKERNEL         bli_dgemm_opt_4x4
+#define BLIS_SGEMM_UKERNEL         bli_sgemm_opt_8x12
+#define BLIS_DGEMM_UKERNEL         bli_dgemm_opt_6x8
 
 // -- trsm-related --
 
diff --git a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
index 2a54fe825..e010d188f 100644
--- a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
+++ b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
@@ -36,9 +36,21 @@
 #include "blis.h"
 
 /*
+   o 4x4 Single precision micro-kernel fully functional.
+   o Runnable on ARMv8, compiled with aarch64 GCC.
+   o Use it together with the armv8 BLIS configuration.
    o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz. 
+
+   December 2014.
+ 
+ * UPDATE NOVEMBER 2015
+ * Micro-kernel changed to 8x12
+ * Tested on Juno board. Around  8.1 GFLOPS, 1 x A57 core  @ 1.1 GHz.
+ * Tested on Juno board. Around 15.9 GFLOPS, 2 x A57 cores @ 1.1 GHz.
+ * Tested on Juno board. Around  3.1 GFLOPS, 1 x A53 core  @ 850 MHz. 
+ * Tested on Juno board. Around 12   GFLOPS, 4 x A53 cores @ 850 MHz.
 */
-void bli_sgemm_opt_4x4(
+void bli_sgemm_opt_8x12(
                         dim_t              k,
                         float*    restrict alpha,
                         float*    restrict a,
@@ -50,9 +62,9 @@ void bli_sgemm_opt_4x4(
 {
 	void* a_next = bli_auxinfo_next_a( data );
 	void* b_next = bli_auxinfo_next_b( data );
-        
+
 	dim_t k_iter = k / 4;
-	dim_t k_left = k % 4; 
+	dim_t k_left = k % 4;
 
 __asm__ volatile 
 (
@@ -62,10 +74,8 @@ __asm__ volatile
 " ldr x1,%[baddr]                            \n\t" // Load address of B.
 " ldr x2,%[caddr]                            \n\t" // Load address of C.
 "                                            \n\t"
-" mov x4,#1                                  \n\t" // Init loop counter (i=0).
-"                                            \n\t"
-" ldr x16,%[a_next]                          \n\t" // Pointer to next block of A.
-" ldr x17,%[b_next]                          \n\t" // Pointer to next pointer of B.
+" ldr x3,%[a_next]                           \n\t" // Pointer to next block of A.
+" ldr x4,%[b_next]                           \n\t" // Pointer to next pointer of B.
 "                                            \n\t"
 " ldr x5,%[k_iter]                           \n\t" // Number of unrolled iterations (k_iter).
 " ldr x6,%[k_left]                           \n\t" // Number of remaining iterations (k_left).
@@ -75,157 +85,367 @@ __asm__ volatile
 "                                            \n\t" 
 " ldr x9,%[cs_c]                             \n\t" // Load cs_c.
 " lsl x10,x9,#2                              \n\t" // cs_c * sizeof(float) -- AUX.
-" lsl x11,x9,#3                              \n\t" // 2 * cs_c * sizeof(float) -- AUX.
-" lsl x12,x9,#4                              \n\t" // 3 * cs_c * sizeof(float) -- AUX.
 "                                            \n\t" 
 " ldr x13,%[rs_c]                            \n\t" // Load rs_c.
 " lsl x14,x13,#2                             \n\t" // rs_c * sizeof(float).
-"                                            \n\t" 
-" ldp q0,q1,[x0,0]                           \n\t" // Preload columns a,a+1 into two quads.
-" ldp q4,q5,[x1,0]                           \n\t" // Preload rows    b,b+1 into two quads.
 "                                            \n\t"
-" prfm pldl1keep,[x2,0]                      \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x10]                    \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x11]                    \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x12]                    \n\t" // Prefetch c.
+" add x16,x2,x10                             \n\t" //Load address Column 1 of C
+" add x17,x16,x10                            \n\t" //Load address Column 2 of C
+" add x18,x17,x10                            \n\t" //Load address Column 3 of C
+" add x19,x18,x10                            \n\t" //Load address Column 4 of C
+" add x20,x19,x10                            \n\t" //Load address Column 5 of C
+" add x21,x20,x10                            \n\t" //Load address Column 6 of C
+" add x22,x21,x10                            \n\t" //Load address Column 7 of C
+" add x23,x22,x10                            \n\t" //Load address Column 8 of C
+" add x24,x23,x10                            \n\t" //Load address Column 9 of C
+" add x25,x24,x10                            \n\t" //Load address Column 10 of C
+" add x26,x25,x10                            \n\t" //Load address Column 11 of C
 "                                            \n\t"
-"                                            \n\t" // Vectors for result columns.
-" movi v8.4s,#0                              \n\t" // Vector for result column 0.
-" movi v9.4s,#0                              \n\t" // Vector for result column 1.
-" movi v10.4s,#0                             \n\t" // Vector for result column 2.
-" movi v11.4s,#0                             \n\t" // Vector for result column 3.
+" ldr q0, [x0]                               \n\t"
+" ldr q1, [x0, #16]                          \n\t" // Load a
 "                                            \n\t"
-"                                            \n\t" // Replicating accum. vectors for unrolling.
-" movi v12.4s,#0                             \n\t" // Vector 1 for accummulating column 0.
-" movi v13.4s,#0                             \n\t" // Vector 1 for accummulating column 1.
-" movi v14.4s,#0                             \n\t" // Vector 1 for accummulating column 2.
-" movi v15.4s,#0                             \n\t" // Vector 1 for accummulating column 3.
+" ldr q2, [x1]                               \n\t" // Load b
+" ldr q3, [x1, #16]                          \n\t"
+" ldr q4, [x1, #32]                          \n\t"
 "                                            \n\t"
-" movi v16.4s,#0                             \n\t" // Vector 2 for accummulating column 0.
-" movi v17.4s,#0                             \n\t" // Vector 2 for accummulating column 1.
-" movi v18.4s,#0                             \n\t" // Vector 2 for accummulating column 2.
-" movi v19.4s,#0                             \n\t" // Vector 2 for accummulating column 3.
+" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
+" prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x18]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x19]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
 "                                            \n\t"
-" movi v20.4s,#0                             \n\t" // Vector 3 for accummulating column 0.
-" movi v21.4s,#0                             \n\t" // Vector 3 for accummulating column 1.
-" movi v22.4s,#0                             \n\t" // Vector 3 for accummulating column 2.
-" movi v23.4s,#0                             \n\t" // Vector 3 for accummulating column 3.
+" dup  v8.4s, wzr                            \n\t" // Vector for accummulating column 0
+" prfm    PLDL1KEEP, [x1, #192]              \n\t" 
+" dup  v9.4s, wzr                            \n\t" // Vector for accummulating column 0
+" prfm    PLDL1KEEP, [x1, #256]              \n\t"
+" dup  v10.4s, wzr                           \n\t" // Vector for accummulating column 1
+" prfm    PLDL1KEEP, [x1, #320]              \n\t"
+" dup  v11.4s, wzr                           \n\t" // Vector for accummulating column 1
+" dup  v12.4s, wzr                           \n\t" // Vector for accummulating column 2 
+" dup  v13.4s, wzr                           \n\t" // Vector for accummulating column 2
 "                                            \n\t"
-" movi v24.4s,#0                             \n\t" // Vector 4 for accummulating column 0.
-" movi v25.4s,#0                             \n\t" // Vector 4 for accummulating column 1.
-" movi v26.4s,#0                             \n\t" // Vector 4 for accummulating column 2.
-" movi v27.4s,#0                             \n\t" // Vector 4 for accummulating column 3.
+" dup  v14.4s, wzr                           \n\t" // Vector for accummulating column 3
+" prfm    PLDL1KEEP, [x0, #128]              \n\t"
+" dup  v15.4s, wzr                           \n\t" // Vector for accummulating column 3
+" prfm    PLDL1KEEP, [x0, #192]              \n\t"
+" dup  v16.4s, wzr                           \n\t" // Vector for accummulating column 4
+" dup  v17.4s, wzr                           \n\t" // Vector for accummulating column 4
+" dup  v18.4s, wzr                           \n\t" // Vector for accummulating column 5 
+" dup  v19.4s, wzr                           \n\t" // Vector for accummulating column 5
 "                                            \n\t"
-" ld1r {v31.4s},[x8]                         \n\t" // Load beta into quad.
+" dup  v20.4s, wzr                           \n\t" // Vector for accummulating column 6 
+" dup  v21.4s, wzr                           \n\t" // Vector for accummulating column 6
+" dup  v22.4s, wzr                           \n\t" // Vector for accummulating column 7
+" dup  v23.4s, wzr                           \n\t" // Vector for accummulating column 7
+" dup  v24.4s, wzr                           \n\t" // Vector for accummulating column 8 
+" dup  v25.4s, wzr                           \n\t" // Vector for accummulating column 8
+"                                            \n\t"
+" dup  v26.4s, wzr                           \n\t" // Vector for accummulating column 9 
+" dup  v27.4s, wzr                           \n\t" // Vector for accummulating column 9
+" dup  v28.4s, wzr                           \n\t" // Vector for accummulating column 10
+" dup  v29.4s, wzr                           \n\t" // Vector for accummulating column 10
+" dup  v30.4s, wzr                           \n\t" // Vector for accummulating column 11 
+" dup  v31.4s, wzr                           \n\t" // Vector for accummulating column 11
 "                                            \n\t"
 " cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
 " beq .SCONSIDERKLEFT                        \n\t"
 "                                            \n\t"
+"add x0, x0, #32                             \n\t" //update address of A
+"add x1, x1, #48                             \n\t" //update address of B
+"                                            \n\t"
 " cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
 " beq .SLASTITER                             \n\t" // (as loop is do-while-like).
 "                                            \n\t"
 " .SLOOPKITER:                               \n\t" // Body of the k_iter loop.
 "                                            \n\t"
-" prfm pldl1keep,[x0,#1024]                  \n\t" // Prefetch.
-" prfm pldl1keep,[x1,#1024]                  \n\t" // Prefetch.
+" ldr q5, [x0]                               \n\t"
+" fmla v8.4s, v0.4s,v2.s[0]                  \n\t" // Accummulate.
+" fmla v9.4s, v1.4s,v2.s[0]                  \n\t" // Accummulate.
+" ldr q6, [x0, #16]                          \n\t"
+" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+" ldr q2, [x1]                               \n\t"
 "                                            \n\t"
-" fmla v12.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v13.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+" prfm    PLDL1KEEP, [x1, #336]              \n\t" 
+" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+" prfm    PLDL1KEEP, [x1, #400]              \n\t" 
+" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+" prfm    PLDL1KEEP, [x1, #464]              \n\t" 
+" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
-" ldp q6,q7,[x1,32]                          \n\t" // Load rows b+2,b+3 into quads.
+" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q3, [x1, #16]                          \n\t"
 "                                            \n\t"
-" fmla v14.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v15.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q4, [x1, #32]                          \n\t"
+"                                            \n\t" //End It 1
 "                                            \n\t"
-" ldp q2,q3,[x0,32]                          \n\t" // Load columns a+2,a+3 into quads.
+" ldr q0, [x0, #32]                          \n\t"
+" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
+" ldr q1, [x0, #48]                          \n\t"
+" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
+" ldr q2, [x1, #48]                          \n\t"
 "                                            \n\t"
-" fmla v16.4s,v1.4s,v5.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v1.4s,v5.s[1]                  \n\t" // Accummulate.
+" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
+" prfm    PLDL1KEEP, [x0, #224]              \n\t"
+" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
+" prfm    PLDL1KEEP, [x0, #288]              \n\t"
+" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
-" fmla v18.4s,v1.4s,v5.s[2]                  \n\t" // Accummulate.
-" fmla v19.4s,v1.4s,v5.s[3]                  \n\t" // Accummulate.
+" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q3, [x1, #64]                          \n\t"
 "                                            \n\t"
-" add x0,x0,64                               \n\t" // Update a_ptr.
-" add x1,x1,64                               \n\t" // Update b_ptr.
+" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q4, [x1, #80]                          \n\t"
+"                                            \n\t" //End It 2
 "                                            \n\t"
-" fmla v20.4s,v2.4s,v6.s[0]                  \n\t" // Accummulate.
-" fmla v21.4s,v2.4s,v6.s[1]                  \n\t" // Accummulate.
+" ldr q5, [x0, #64]                          \n\t"
+" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
+" ldr q6, [x0, #80]                          \n\t"
+" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+" ldr q2, [x1, #96]                          \n\t"
 "                                            \n\t"
-" ldp q0,q1,[x0]                             \n\t" // Load columns a,a+1 into quads (next iteration).
+" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
-" fmla v22.4s,v2.4s,v6.s[2]                  \n\t" // Accummulate.
-" fmla v23.4s,v2.4s,v6.s[3]                  \n\t" // Accummulate.
+" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q3, [x1, #112]                         \n\t"
 "                                            \n\t"
-" ldp q4,q5,[x1]                             \n\t" // Load rows b,b+1 into quads (next iteration).
+" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q4, [x1, #128]                         \n\t"
+"                                            \n\t" //End It 3
 "                                            \n\t"
-" fmla v24.4s,v3.4s,v7.s[0]                  \n\t" // Accummulate.
-" fmla v25.4s,v3.4s,v7.s[1]                  \n\t" // Accummulate.
+" ldr q0, [x0, #96]                          \n\t"
+" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
+" ldr q1, [x0, #112]                         \n\t"
+" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
+" ldr q2, [x1, #144]                         \n\t"
 "                                            \n\t"
-" prfm pldl1keep,[x0,#64]                    \n\t" // Prefetch.
-" prfm pldl1keep,[x1,#64]                    \n\t" // Prefetch.
+" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
-" fmla v26.4s,v3.4s,v7.s[2]                  \n\t" // Accummulate.
-" fmla v27.4s,v3.4s,v7.s[3]                  \n\t" // Accummulate.
+" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q3, [x1, #160]                         \n\t"
 "                                            \n\t"
+" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q4, [x1, #176]                         \n\t"
+" add x1, x1, #192                           \n\t"
+" add x0, x0, #128                           \n\t"
+"                                            \n\t" //End It 4
 " sub x5,x5,1                                \n\t" // i-=1.
 " cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
 " bne .SLOOPKITER                            \n\t"
 "                                            \n\t" 
-//" prfm pldl1keep,[x0,#1024]                \n\t"
-//" prfm pldl1keep,[x1,#1024]                \n\t"
-"                                            \n\t" 
 " .SLASTITER:                                \n\t" // Last iteration of k_iter loop.
 "                                            \n\t" 
-" fmla v12.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v13.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
 "                                            \n\t"
-" ldp q6,q7,[x1,32]                          \n\t" // Load rows b+2,b+3 into quads.
+" ldr q5, [x0]                               \n\t"
+" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
+" ldr q6, [x0, #16]                          \n\t"
+" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+" ldr q2, [x1]                               \n\t"
 "                                            \n\t"
-" fmla v14.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v15.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
-" ldp q2,q3,[x0,32]                          \n\t" // Load columns a+2,a+3 into quads.
+" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q3, [x1, #16]                          \n\t"
 "                                            \n\t"
-" fmla v16.4s,v1.4s,v5.s[0]                  \n\t" // Accummulate.
-" fmla v17.4s,v1.4s,v5.s[1]                  \n\t" // Accummulate.
+" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q4, [x1, #32]                          \n\t"
+"                                            \n\t" //End It 1
 "                                            \n\t"
-" ld1r {v30.4s},[x7]                         \n\t" // Load alpha.
+" ldr q0, [x0, #32]                          \n\t"
+" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
+" ldr q1, [x0, #48]                          \n\t"
+" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
+" ldr q2, [x1, #48]                          \n\t"
 "                                            \n\t"
-" fmla v18.4s,v1.4s,v5.s[2]                  \n\t" // Accummulate.
-" fmla v19.4s,v1.4s,v5.s[3]                  \n\t" // Accummulate.
+" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
-" fmla v20.4s,v2.4s,v6.s[0]                  \n\t" // Accummulate.
-" fmla v21.4s,v2.4s,v6.s[1]                  \n\t" // Accummulate.
+" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q3, [x1, #64]                          \n\t"
 "                                            \n\t"
-" fmla v22.4s,v2.4s,v6.s[2]                  \n\t" // Accummulate.
-" fmla v23.4s,v2.4s,v6.s[3]                  \n\t" // Accummulate.
+" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q4, [x1, #80]                          \n\t"
+"                                            \n\t" //End It 2
 "                                            \n\t"
-" fmla v24.4s,v3.4s,v7.s[0]                  \n\t" // Accummulate.
-" fmla v25.4s,v3.4s,v7.s[1]                  \n\t" // Accummulate.
+" ldr q5, [x0, #64]                          \n\t"
+" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
+" ldr q6, [x0, #80]                          \n\t"
+" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
+" ldr q2, [x1, #96]                          \n\t"
 "                                            \n\t"
-" fmla v26.4s,v3.4s,v7.s[2]                  \n\t" // Accummulate.
-" fmla v27.4s,v3.4s,v7.s[3]                  \n\t" // Accummulate.
+" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
-//" ld1 {v8.4s},[x2],x10                       \n\t" // Load c    into quad and increment by cs_c
-//" ld1 {v9.4s},[x2],x10                       \n\t" // Load c+4  into quad and increment by cs_c
-//" ld1 {v10.4s},[x2],x10                      \n\t" // Load c+8  into quad and increment by cs_c
-//" ld1 {v11.4s},[x2],x10                      \n\t" // Load c+16 into quad and increment by cs_c
+" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q3, [x1, #112]                         \n\t"
 "                                            \n\t"
-" fadd v12.4s,v12.4s,v16.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v13.4s,v13.4s,v17.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v14.4s,v14.4s,v18.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v15.4s,v15.4s,v19.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v12.4s,v12.4s,v20.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v13.4s,v13.4s,v21.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v14.4s,v14.4s,v22.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v15.4s,v15.4s,v23.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v12.4s,v12.4s,v24.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v13.4s,v13.4s,v25.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v14.4s,v14.4s,v26.4s                  \n\t" // Final accummulate of temporal accum. vectors.
-" fadd v15.4s,v15.4s,v27.4s                  \n\t" // Final accummulate of temporal accum. vectors.
+" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
+" ldr q4, [x1, #128]                         \n\t"
+"                                            \n\t" //End It 3
 "                                            \n\t"
-" add x0,x0,64                               \n\t" // Update a_ptr.
-" add x1,x1,64                               \n\t" // Update b_ptr.
+" fmla v8.4s,v5.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v9.4s,v6.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v10.4s,v5.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v6.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v5.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v6.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v5.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v6.4s,v2.s[3]                  \n\t" // Accummulate.
+"                                            \n\t"
+" fmla v16.4s,v5.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v17.4s,v6.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v18.4s,v5.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v6.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v20.4s,v5.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v6.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v5.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v6.4s,v3.s[3]                  \n\t" // Accummulate.
+"                                            \n\t"
+" fmla v24.4s,v5.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v5.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v5.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v5.4s,v4.s[3]                  \n\t" // Accummulate.
+"                                            \n\t"
+" fmla v25.4s,v6.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v6.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v6.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v6.4s,v4.s[3]                  \n\t" // Accummulate.
+" add x1, x1, #144                           \n\t"
+" add x0, x0, #96                            \n\t"
+"                                            \n\t" //End It 4
 "                                            \n\t"
 " .SCONSIDERKLEFT:                           \n\t" 
 " cmp x6,0                                   \n\t" // If k_left == 0, we are done.
@@ -233,165 +453,595 @@ __asm__ volatile
 "                                            \n\t"
 " .SLOOPKLEFT:                               \n\t" // Body of the left iterations
 "                                            \n\t"
-" prfm pldl1keep,[x0,#1024]                  \n\t" // Prefetch.
-" prfm pldl1keep,[x1,#1024]                  \n\t" // Prefetch.
+" ldr q0, [x0],#16                           \n\t"
+" ldr q1, [x0],#16                           \n\t" // Load a
 "                                            \n\t"
-" ldr q0,[x0]                                \n\t" // Load a into quad (next iteration).
-" ldr q4,[x1]                                \n\t" // Load b into quad (next iteration).
-"                                            \n\t"
-" add x0,x0,16                               \n\t" // Update a_ptr.
-" add x1,x1,16                               \n\t" // Update b_ptr.
+" ldr q2, [x1],#16                           \n\t" // Load b
+" ldr q3, [x1],#16                           \n\t"
+" ldr q4, [x1],#16                           \n\t"
 "                                            \n\t"
 " sub x6,x6,1                                \n\t" // i = i-1.
 "                                            \n\t"
-" fmla v12.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
-" fmla v13.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v8.4s,v0.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v9.4s,v1.4s,v2.s[0]                   \n\t" // Accummulate.
+" fmla v10.4s,v0.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v11.4s,v1.4s,v2.s[1]                  \n\t" // Accummulate.
+" fmla v12.4s,v0.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v13.4s,v1.4s,v2.s[2]                  \n\t" // Accummulate.
+" fmla v14.4s,v0.4s,v2.s[3]                  \n\t" // Accummulate.
+" fmla v15.4s,v1.4s,v2.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
-" fmla v14.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
-" fmla v15.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+" fmla v16.4s,v0.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v17.4s,v1.4s,v3.s[0]                  \n\t" // Accummulate.
+" fmla v18.4s,v0.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v19.4s,v1.4s,v3.s[1]                  \n\t" // Accummulate.
+" fmla v20.4s,v0.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v21.4s,v1.4s,v3.s[2]                  \n\t" // Accummulate.
+" fmla v22.4s,v0.4s,v3.s[3]                  \n\t" // Accummulate.
+" fmla v23.4s,v1.4s,v3.s[3]                  \n\t" // Accummulate.
+"                                            \n\t"
+" fmla v24.4s,v0.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v26.4s,v0.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v28.4s,v0.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v30.4s,v0.4s,v4.s[3]                  \n\t" // Accummulate.
+" fmla v25.4s,v1.4s,v4.s[0]                  \n\t" // Accummulate.
+" fmla v27.4s,v1.4s,v4.s[1]                  \n\t" // Accummulate.
+" fmla v29.4s,v1.4s,v4.s[2]                  \n\t" // Accummulate.
+" fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
 " cmp x6,0                                   \n\t" // Iterate again.
 " bne .SLOOPKLEFT                            \n\t" // if i!=0.
 "                                            \n\t"
-" ld1r {v30.4s},[x7]                         \n\t" // Load alpha.
-"                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-"                                            \n\t"
 " .SPOSTACCUM:                               \n\t"
+"                                            \n\t"
+" ld1r {v6.4s},[x7]                          \n\t" // Load alpha.
+" ld1r {v7.4s},[x8]                          \n\t" // Load beta
+"                                            \n\t"
 " cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
 " bne .SGENSTORED                            \n\t"
 "                                            \n\t"
-"                                            \n\t"
 " .SCOLSTORED:                               \n\t" // C is column-major.
 "                                            \n\t"
-" fcmp s31,#0.0                              \n\t"
-" beq .BETAZEROCOLSTORED                     \n\t" // Taking care of the beta==0 case.
+" dup  v0.4s, wzr                            \n\t"
+" dup  v1.4s, wzr                            \n\t"
+" dup  v2.4s, wzr                            \n\t"
+" dup  v3.4s, wzr                            \n\t"
+" dup  v4.4s, wzr                            \n\t"
+" dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
-"                                            \n\t" // If beta!=0, then we can read from C.
-" ld1 {v8.4s},[x2],x10                       \n\t" // Load c    into quad and increment by cs_c.
-" ld1 {v9.4s},[x2],x10                       \n\t" // Load c+4  into quad and increment by cs_c.
-" ld1 {v10.4s},[x2],x10                      \n\t" // Load c+8  into quad and increment by cs_c.
-" ld1 {v11.4s},[x2],x10                      \n\t" // Load c+16 into quad and increment by cs_c.
+" fcmp s7,#0.0                               \n\t"
+" beq .SBETAZEROCOLSTOREDS1                  \n\t" // Taking care of the beta==0 case.
 "                                            \n\t"
-" prfm pldl1keep,[x16,0]                     \n\t" // Prefetch.
-" prfm pldl1keep,[x17,0]                     \n\t" // Prefetch.
+" ldr q0, [x2]                               \n\t" //Load column 0 of C
+" ldr q1, [x2, #16]                          \n\t"
+" ldr q2, [x16]                              \n\t" //Load column 1 of C
+" ldr q3, [x16, #16]                         \n\t"
+" ldr q4, [x17]                              \n\t" //Load column 2 of C
+" ldr q5, [x17, #16]                         \n\t"
 "                                            \n\t"
+" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" fmul v8.4s,v8.4s,v31.s[0]                  \n\t" // Scale by beta.
-" fmul v9.4s,v9.4s,v31.s[0]                  \n\t" // Scale by beta.
-" fmul v10.4s,v10.4s,v31.s[0]                \n\t" // Scale by beta.
-" fmul v11.4s,v11.4s,v31.s[0]                \n\t" // Scale by beta.
+" .SBETAZEROCOLSTOREDS1:                     \n\t"
 "                                            \n\t"
-" .BETAZEROCOLSTORED:                        \n\t" // If beta==0, we won't read from C (nor scale).
+" fmla v0.4s,v8.4s,v6.s[0]                   \n\t" // Scale by alpha
+" fmla v1.4s,v9.4s,v6.s[0]                   \n\t" // Scale by alpha
+" fmla v2.4s,v10.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v3.4s,v11.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
+" str q0, [x2]                               \n\t" //Store column 0 of C
+" str q1, [x2, #16]                          \n\t"
+" str q2, [x16]                              \n\t" //Store column 1 of C
+" str q3, [x16, #16]                         \n\t"
+" str q4, [x17]                              \n\t" //Store column 2 of C
+" str q5, [x17, #16]                         \n\t"
 "                                            \n\t"
-" fmla v8.4s,v12.4s,v30.s[0]                 \n\t" // Scale by alpha
-" fmla v9.4s,v13.4s,v30.s[0]                 \n\t" // Scale by alpha
-" fmla v10.4s,v14.4s,v30.s[0]                \n\t" // Scale by alpha
-" fmla v11.4s,v15.4s,v30.s[0]                \n\t" // Scale by alpha
+" dup  v8.4s, wzr                            \n\t"
+" dup  v9.4s, wzr                            \n\t"
+" dup  v10.4s, wzr                           \n\t"
+" dup  v11.4s, wzr                           \n\t"
+" dup  v12.4s, wzr                           \n\t"
+" dup  v13.4s, wzr                           \n\t"
+"                                            \n\t"
+" fcmp s7,#0.0                               \n\t"
+" beq .SBETAZEROCOLSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" ldr q8, [x18]                              \n\t" //Load column 3 of C
+" ldr q9, [x18, #16]                         \n\t"
+" ldr q10, [x19]                             \n\t" //Load column 4 of C
+" ldr q11, [x19, #16]                        \n\t"
+" ldr q12, [x20]                             \n\t" //Load column 5 of C
+" ldr q13, [x20, #16]                        \n\t"
+"                                            \n\t"
+" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+"                                            \n\t"
+" .SBETAZEROCOLSTOREDS2:                     \n\t"
+"                                            \n\t"
+" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v10.4s,v16.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v11.4s,v17.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
+"                                            \n\t"
+" str q8, [x18]                              \n\t" //Store column 3 of C
+" str q9, [x18, #16]                         \n\t"
+" str q10, [x19]                             \n\t" //Store column 4 of C
+" str q11, [x19, #16]                        \n\t"
+" str q12, [x20]                             \n\t" //Store column 5 of C
+" str q13, [x20, #16]                        \n\t"
+"                                            \n\t"
+" dup  v0.4s, wzr                            \n\t"
+" dup  v1.4s, wzr                            \n\t"
+" dup  v2.4s, wzr                            \n\t"
+" dup  v3.4s, wzr                            \n\t"
+" dup  v4.4s, wzr                            \n\t"
+" dup  v5.4s, wzr                            \n\t"
+"                                            \n\t"
+" fcmp s7,#0.0                               \n\t"
+" beq .SBETAZEROCOLSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" ldr q0, [x21]                              \n\t" //Load column 6 of C
+" ldr q1, [x21, #16]                         \n\t"
+" ldr q2, [x22]                              \n\t" //Load column 7 of C
+" ldr q3, [x22, #16]                         \n\t"
+" ldr q4, [x23]                              \n\t" //Load column 8 of C
+" ldr q5, [x23, #16]                         \n\t"
+"                                            \n\t"
+" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
+"                                            \n\t"
+" .SBETAZEROCOLSTOREDS3:                     \n\t"
+"                                            \n\t"
+" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v2.4s,v22.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v3.4s,v23.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
+"                                            \n\t"
+" str q0, [x21]                              \n\t" //Store column 6 of C
+" str q1, [x21, #16]                         \n\t"
+" str q2, [x22]                              \n\t" //Store column 7 of C
+" str q3, [x22, #16]                         \n\t"
+" str q4, [x23]                              \n\t" //Store column 8 of C
+" str q5, [x23, #16]                         \n\t"
+"                                            \n\t"
+" dup  v8.4s, wzr                            \n\t"
+" dup  v9.4s, wzr                            \n\t"
+" dup  v10.4s, wzr                            \n\t"
+" dup  v11.4s, wzr                            \n\t"
+" dup  v12.4s, wzr                            \n\t"
+" dup  v13.4s, wzr                            \n\t"
+"                                            \n\t"
+" fcmp s7,#0.0                               \n\t"
+" beq .SBETAZEROCOLSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" ldr q8, [x24]                              \n\t" //Load column 9 of C
+" ldr q9, [x24, #16]                         \n\t"
+" ldr q10, [x25]                             \n\t" //Load column 10 of C
+" ldr q11, [x25, #16]                        \n\t"
+" ldr q12, [x26]                             \n\t" //Load column 11 of C
+" ldr q13, [x26, #16]                        \n\t"
+"                                            \n\t"
+" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+"                                            \n\t"
+" .SBETAZEROCOLSTOREDS4:                     \n\t"
+"                                            \n\t"
+" prfm pldl2keep,[x3]                        \n\t"
+" prfm pldl2keep,[x4]                        \n\t"
+"                                            \n\t"
+" fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v10.4s,v28.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v11.4s,v29.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
+"                                            \n\t"
+" str q8, [x24]                              \n\t" //Store column 9 of C
+" str q9, [x24, #16]                         \n\t"
+" str q10, [x25]                             \n\t" //Store column 10 of C
+" str q11, [x25, #16]                        \n\t"
+" str q12, [x26]                             \n\t" //Store column 11 of C
+" str q13, [x26, #16]                        \n\t"
 "                                            \n\t"
-" st1 {v8.4s},[x2],x10                       \n\t" // Store quad into c    and increment by cs_c
-" st1 {v9.4s},[x2],x10                       \n\t" // Store quad into c+4  and increment by cs_c
-" st1 {v10.4s},[x2],x10                      \n\t" // Store quad into c+8  and increment by cs_c
-" st1 {v11.4s},[x2],x10                      \n\t" // Store quad into c+16 and increment by cs_c
 "                                            \n\t"
 " b .SEND                                    \n\t" // Done (TODO: this obviously needs to be moved down to remove jump).
 "                                            \n\t"
 "                                            \n\t"
 " .SGENSTORED:                               \n\t" // C is general-stride stored.
 "                                            \n\t"
-" fcmp s31,#0.0                              \n\t"
-" beq .BETAZEROGENSTORED                     \n\t"
 "                                            \n\t"
-"                                            \n\t" // If beta!=0, then we can read from C.
-"                                            \n\t" // TODO: this was done fast. Rearrange to remove so many address reloads.
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
+" dup  v0.4s, wzr                            \n\t"
+" dup  v1.4s, wzr                            \n\t"
+" dup  v2.4s, wzr                            \n\t"
+" dup  v3.4s, wzr                            \n\t"
+" dup  v4.4s, wzr                            \n\t"
+" dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
-" ld1 {v8.s}[0],[x2],x14                     \n\t" // Load c00  into quad and increment by rs_c.
-" ld1 {v8.s}[1],[x2],x14                     \n\t" // Load c01  into quad and increment by rs_c.
-" ld1 {v8.s}[2],[x2],x14                     \n\t" // Load c02  into quad and increment by rs_c.
-" ld1 {v8.s}[3],[x2],x14                     \n\t" // Load c03  into quad and increment by rs_c.
+" fcmp s7,#0.0                               \n\t"
+" beq .SBETAZEROGENSTOREDS1                  \n\t" // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" mov x27, x2                                \n\t"
 "                                            \n\t"
-" ld1 {v9.s}[0],[x2],x14                     \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v9.s}[1],[x2],x14                     \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v9.s}[2],[x2],x14                     \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v9.s}[3],[x2],x14                     \n\t" // Load c13  into quad and increment by rs_c.
+" ld1 {v0.s}[0],[x27],x14                    \n\t" // Load c00  into quad and increment by rs_c.
+" ld1 {v0.s}[1],[x27],x14                    \n\t" // Load c01  into quad and increment by rs_c.
+" ld1 {v0.s}[2],[x27],x14                    \n\t" // Load c02  into quad and increment by rs_c.
+" ld1 {v0.s}[3],[x27],x14                    \n\t" // Load c03  into quad and increment by rs_c.
+" ld1 {v1.s}[0],[x27],x14                    \n\t" // Load c04  into quad and increment by rs_c.
+" ld1 {v1.s}[1],[x27],x14                    \n\t" // Load c05  into quad and increment by rs_c.
+" ld1 {v1.s}[2],[x27],x14                    \n\t" // Load c06  into quad and increment by rs_c.
+" ld1 {v1.s}[3],[x27],x14                    \n\t" // Load c07  into quad and increment by rs_c.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" mov x27, x16                               \n\t"
 "                                            \n\t"
-" ld1 {v10.s}[0],[x2],x14                    \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v10.s}[1],[x2],x14                    \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v10.s}[2],[x2],x14                    \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v10.s}[3],[x2],x14                    \n\t" // Load c13  into quad and increment by rs_c.
+" ld1 {v2.s}[0],[x27],x14                    \n\t" // Load c10  into quad and increment by rs_c.
+" ld1 {v2.s}[1],[x27],x14                    \n\t" // Load c11  into quad and increment by rs_c.
+" ld1 {v2.s}[2],[x27],x14                    \n\t" // Load c12  into quad and increment by rs_c.
+" ld1 {v2.s}[3],[x27],x14                    \n\t" // Load c13  into quad and increment by rs_c.
+" ld1 {v3.s}[0],[x27],x14                    \n\t" // Load c14  into quad and increment by rs_c.
+" ld1 {v3.s}[1],[x27],x14                    \n\t" // Load c15  into quad and increment by rs_c.
+" ld1 {v3.s}[2],[x27],x14                    \n\t" // Load c16  into quad and increment by rs_c.
+" ld1 {v3.s}[3],[x27],x14                    \n\t" // Load c17  into quad and increment by rs_c.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" mov x27, x17                               \n\t"
 "                                            \n\t"
-" ld1 {v11.s}[0],[x2],x14                    \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v11.s}[1],[x2],x14                    \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v11.s}[2],[x2],x14                    \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v11.s}[3],[x2],x14                    \n\t" // Load c13  into quad and increment by rs_c.
+" ld1 {v4.s}[0],[x27],x14                    \n\t" // Load c20  into quad and increment by rs_c.
+" ld1 {v4.s}[1],[x27],x14                    \n\t" // Load c21  into quad and increment by rs_c.
+" ld1 {v4.s}[2],[x27],x14                    \n\t" // Load c22  into quad and increment by rs_c.
+" ld1 {v4.s}[3],[x27],x14                    \n\t" // Load c23  into quad and increment by rs_c.
+" ld1 {v5.s}[0],[x27],x14                    \n\t" // Load c24  into quad and increment by rs_c.
+" ld1 {v5.s}[1],[x27],x14                    \n\t" // Load c25  into quad and increment by rs_c.
+" ld1 {v5.s}[2],[x27],x14                    \n\t" // Load c26  into quad and increment by rs_c.
+" ld1 {v5.s}[3],[x27],x14                    \n\t" // Load c27  into quad and increment by rs_c.
 "                                            \n\t"
+" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" prfm pldl1keep,[x16,0]                     \n\t" // Prefetch.
-" prfm pldl1keep,[x17,0]                     \n\t" // Prefetch.
+" .SBETAZEROGENSTOREDS1:                     \n\t"
 "                                            \n\t"
-" fmul v8.4s,v8.4s,v31.s[0]                  \n\t" // Scale by beta.
-" fmul v9.4s,v9.4s,v31.s[0]                  \n\t" // Scale by beta.
-" fmul v10.4s,v10.4s,v31.s[0]                \n\t" // Scale by beta.
-" fmul v11.4s,v11.4s,v31.s[0]                \n\t" // Scale by beta.
+" fmla v0.4s, v8.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v1.4s, v9.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v2.4s,v10.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v3.4s,v11.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" .BETAZEROGENSTORED:                        \n\t" // If beta==0, we cannot read from C (nor scale).
+" mov x27, x2                                \n\t"
 "                                            \n\t"
-" fmla v8.4s,v12.4s,v30.s[0]                 \n\t" // Scale by alpha.
-" fmla v9.4s,v13.4s,v30.s[0]                 \n\t" // Scale by alpha.
-" fmla v10.4s,v14.4s,v30.s[0]                \n\t" // Scale by alpha.
-" fmla v11.4s,v15.4s,v30.s[0]                \n\t" // Scale by alpha.
+" st1 {v0.s}[0],[x27],x14                    \n\t" // Store c00  into quad and increment by rs_c.
+" st1 {v0.s}[1],[x27],x14                    \n\t" // Store c01  into quad and increment by rs_c.
+" st1 {v0.s}[2],[x27],x14                    \n\t" // Store c02  into quad and increment by rs_c.
+" st1 {v0.s}[3],[x27],x14                    \n\t" // Store c03  into quad and increment by rs_c.
+" st1 {v1.s}[0],[x27],x14                    \n\t" // Store c04  into quad and increment by rs_c.
+" st1 {v1.s}[1],[x27],x14                    \n\t" // Store c05  into quad and increment by rs_c.
+" st1 {v1.s}[2],[x27],x14                    \n\t" // Store c06  into quad and increment by rs_c.
+" st1 {v1.s}[3],[x27],x14                    \n\t" // Store c07  into quad and increment by rs_c.
 "                                            \n\t"
+" mov x27, x16                               \n\t"
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
+" st1 {v2.s}[0],[x27],x14                    \n\t" // Store c10  into quad and increment by rs_c.
+" st1 {v2.s}[1],[x27],x14                    \n\t" // Store c11  into quad and increment by rs_c.
+" st1 {v2.s}[2],[x27],x14                    \n\t" // Store c12  into quad and increment by rs_c.
+" st1 {v2.s}[3],[x27],x14                    \n\t" // Store c13  into quad and increment by rs_c.
+" st1 {v3.s}[0],[x27],x14                    \n\t" // Store c14  into quad and increment by rs_c.
+" st1 {v3.s}[1],[x27],x14                    \n\t" // Store c15  into quad and increment by rs_c.
+" st1 {v3.s}[2],[x27],x14                    \n\t" // Store c16  into quad and increment by rs_c.
+" st1 {v3.s}[3],[x27],x14                    \n\t" // Store c17  into quad and increment by rs_c.
 "                                            \n\t"
-" st1 {v8.s}[0],[x2],x14                     \n\t" // Store c00  into quad and increment by rs_c.
-" st1 {v8.s}[1],[x2],x14                     \n\t" // Store c01  into quad and increment by rs_c.
-" st1 {v8.s}[2],[x2],x14                     \n\t" // Store c02  into quad and increment by rs_c.
-" st1 {v8.s}[3],[x2],x14                     \n\t" // Store c03  into quad and increment by rs_c.
+" mov x27, x17                               \n\t"
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" st1 {v4.s}[0],[x27],x14                    \n\t" // Store c20  into quad and increment by rs_c.
+" st1 {v4.s}[1],[x27],x14                    \n\t" // Store c21  into quad and increment by rs_c.
+" st1 {v4.s}[2],[x27],x14                    \n\t" // Store c22  into quad and increment by rs_c.
+" st1 {v4.s}[3],[x27],x14                    \n\t" // Store c23  into quad and increment by rs_c.
+" st1 {v5.s}[0],[x27],x14                    \n\t" // Store c24  into quad and increment by rs_c.
+" st1 {v5.s}[1],[x27],x14                    \n\t" // Store c25  into quad and increment by rs_c.
+" st1 {v5.s}[2],[x27],x14                    \n\t" // Store c26  into quad and increment by rs_c.
+" st1 {v5.s}[3],[x27],x14                    \n\t" // Store c27  into quad and increment by rs_c.
 "                                            \n\t"
-" st1 {v9.s}[0],[x2],x14                     \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v9.s}[1],[x2],x14                     \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v9.s}[2],[x2],x14                     \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v9.s}[3],[x2],x14                     \n\t" // Store c13  into quad and increment by rs_c.
+" dup  v8.4s, wzr                            \n\t"
+" dup  v9.4s, wzr                            \n\t"
+" dup  v10.4s, wzr                           \n\t"
+" dup  v11.4s, wzr                           \n\t"
+" dup  v12.4s, wzr                           \n\t"
+" dup  v13.4s, wzr                           \n\t"
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" fcmp s7,#0.0                               \n\t"
+" beq .SBETAZEROGENSTOREDS2                  \n\t" // Taking care of the beta==0 case.
 "                                            \n\t"
-" st1 {v10.s}[0],[x2],x14                    \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v10.s}[1],[x2],x14                    \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v10.s}[2],[x2],x14                    \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v10.s}[3],[x2],x14                    \n\t" // Store c13  into quad and increment by rs_c.
+" mov x27, x18                               \n\t"
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" ld1 {v8.s}[0],[x27],x14                    \n\t" // Load c30  into quad and increment by rs_c.
+" ld1 {v8.s}[1],[x27],x14                    \n\t" // Load c31  into quad and increment by rs_c.
+" ld1 {v8.s}[2],[x27],x14                    \n\t" // Load c32  into quad and increment by rs_c.
+" ld1 {v8.s}[3],[x27],x14                    \n\t" // Load c33  into quad and increment by rs_c.
+" ld1 {v9.s}[0],[x27],x14                    \n\t" // Load c34  into quad and increment by rs_c.
+" ld1 {v9.s}[1],[x27],x14                    \n\t" // Load c35  into quad and increment by rs_c.
+" ld1 {v9.s}[2],[x27],x14                    \n\t" // Load c36  into quad and increment by rs_c.
+" ld1 {v9.s}[3],[x27],x14                    \n\t" // Load c37  into quad and increment by rs_c.
 "                                            \n\t"
-" st1 {v11.s}[0],[x2],x14                    \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v11.s}[1],[x2],x14                    \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v11.s}[2],[x2],x14                    \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v11.s}[3],[x2],x14                    \n\t" // Store c13  into quad and increment by rs_c.
+" mov x27, x19                               \n\t"
 "                                            \n\t"
+" ld1 {v10.s}[0],[x27],x14                   \n\t" // Load c40  into quad and increment by rs_c.
+" ld1 {v10.s}[1],[x27],x14                   \n\t" // Load c41  into quad and increment by rs_c.
+" ld1 {v10.s}[2],[x27],x14                   \n\t" // Load c42  into quad and increment by rs_c.
+" ld1 {v10.s}[3],[x27],x14                   \n\t" // Load c43  into quad and increment by rs_c.
+" ld1 {v11.s}[0],[x27],x14                   \n\t" // Load c44  into quad and increment by rs_c.
+" ld1 {v11.s}[1],[x27],x14                   \n\t" // Load c45  into quad and increment by rs_c.
+" ld1 {v11.s}[2],[x27],x14                   \n\t" // Load c46  into quad and increment by rs_c.
+" ld1 {v11.s}[3],[x27],x14                   \n\t" // Load c47  into quad and increment by rs_c.
 "                                            \n\t"
+" mov x27, x20                               \n\t"
+"                                            \n\t"
+" ld1 {v12.s}[0],[x27],x14                   \n\t" // Load c50  into quad and increment by rs_c.
+" ld1 {v12.s}[1],[x27],x14                   \n\t" // Load c51  into quad and increment by rs_c.
+" ld1 {v12.s}[2],[x27],x14                   \n\t" // Load c52  into quad and increment by rs_c.
+" ld1 {v12.s}[3],[x27],x14                   \n\t" // Load c53  into quad and increment by rs_c.
+" ld1 {v13.s}[0],[x27],x14                   \n\t" // Load c54  into quad and increment by rs_c.
+" ld1 {v13.s}[1],[x27],x14                   \n\t" // Load c55  into quad and increment by rs_c.
+" ld1 {v13.s}[2],[x27],x14                   \n\t" // Load c56  into quad and increment by rs_c.
+" ld1 {v13.s}[3],[x27],x14                   \n\t" // Load c57  into quad and increment by rs_c.
+"                                            \n\t"
+" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+"                                            \n\t"
+" .SBETAZEROGENSTOREDS2:                     \n\t"
+"                                            \n\t"
+" fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v10.4s,v16.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v11.4s,v17.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
+"                                            \n\t"
+" mov x27, x18                               \n\t"
+"                                            \n\t"
+" st1 {v8.s}[0],[x27],x14                    \n\t" // Store c30  into quad and increment by rs_c.
+" st1 {v8.s}[1],[x27],x14                    \n\t" // Store c31  into quad and increment by rs_c.
+" st1 {v8.s}[2],[x27],x14                    \n\t" // Store c32  into quad and increment by rs_c.
+" st1 {v8.s}[3],[x27],x14                    \n\t" // Store c33  into quad and increment by rs_c.
+" st1 {v9.s}[0],[x27],x14                    \n\t" // Store c34  into quad and increment by rs_c.
+" st1 {v9.s}[1],[x27],x14                    \n\t" // Store c35  into quad and increment by rs_c.
+" st1 {v9.s}[2],[x27],x14                    \n\t" // Store c36  into quad and increment by rs_c.
+" st1 {v9.s}[3],[x27],x14                    \n\t" // Store c37  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x19                               \n\t"
+"                                            \n\t"
+" st1 {v10.s}[0],[x27],x14                   \n\t" // Store c40  into quad and increment by rs_c.
+" st1 {v10.s}[1],[x27],x14                   \n\t" // Store c41  into quad and increment by rs_c.
+" st1 {v10.s}[2],[x27],x14                   \n\t" // Store c42  into quad and increment by rs_c.
+" st1 {v10.s}[3],[x27],x14                   \n\t" // Store c43  into quad and increment by rs_c.
+" st1 {v11.s}[0],[x27],x14                   \n\t" // Store c44  into quad and increment by rs_c.
+" st1 {v11.s}[1],[x27],x14                   \n\t" // Store c45  into quad and increment by rs_c.
+" st1 {v11.s}[2],[x27],x14                   \n\t" // Store c46  into quad and increment by rs_c.
+" st1 {v11.s}[3],[x27],x14                   \n\t" // Store c47  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x20                               \n\t"
+"                                            \n\t"
+" st1 {v12.s}[0],[x27],x14                   \n\t" // Store c50  into quad and increment by rs_c.
+" st1 {v12.s}[1],[x27],x14                   \n\t" // Store c51  into quad and increment by rs_c.
+" st1 {v12.s}[2],[x27],x14                   \n\t" // Store c52  into quad and increment by rs_c.
+" st1 {v12.s}[3],[x27],x14                   \n\t" // Store c53  into quad and increment by rs_c.
+" st1 {v13.s}[0],[x27],x14                   \n\t" // Store c54  into quad and increment by rs_c.
+" st1 {v13.s}[1],[x27],x14                   \n\t" // Store c55  into quad and increment by rs_c.
+" st1 {v13.s}[2],[x27],x14                   \n\t" // Store c56  into quad and increment by rs_c.
+" st1 {v13.s}[3],[x27],x14                   \n\t" // Store c57  into quad and increment by rs_c.
+"                                            \n\t"
+" dup  v0.4s, wzr                            \n\t"
+" dup  v1.4s, wzr                            \n\t"
+" dup  v2.4s, wzr                            \n\t"
+" dup  v3.4s, wzr                            \n\t"
+" dup  v4.4s, wzr                            \n\t"
+" dup  v5.4s, wzr                            \n\t"
+"                                            \n\t"
+" fcmp s7,#0.0                               \n\t"
+" beq .SBETAZEROGENSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" mov x27, x21                               \n\t"
+"                                            \n\t"
+" ld1 {v0.s}[0],[x27],x14                    \n\t" // Load c60  into quad and increment by rs_c.
+" ld1 {v0.s}[1],[x27],x14                    \n\t" // Load c61  into quad and increment by rs_c.
+" ld1 {v0.s}[2],[x27],x14                    \n\t" // Load c62  into quad and increment by rs_c.
+" ld1 {v0.s}[3],[x27],x14                    \n\t" // Load c63  into quad and increment by rs_c.
+" ld1 {v1.s}[0],[x27],x14                    \n\t" // Load c64  into quad and increment by rs_c.
+" ld1 {v1.s}[1],[x27],x14                    \n\t" // Load c65  into quad and increment by rs_c.
+" ld1 {v1.s}[2],[x27],x14                    \n\t" // Load c66  into quad and increment by rs_c.
+" ld1 {v1.s}[3],[x27],x14                    \n\t" // Load c67  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x22                               \n\t"
+"                                            \n\t"
+" ld1 {v2.s}[0],[x27],x14                    \n\t" // Load c70  into quad and increment by rs_c.
+" ld1 {v2.s}[1],[x27],x14                    \n\t" // Load c71  into quad and increment by rs_c.
+" ld1 {v2.s}[2],[x27],x14                    \n\t" // Load c72  into quad and increment by rs_c.
+" ld1 {v2.s}[3],[x27],x14                    \n\t" // Load c73  into quad and increment by rs_c.
+" ld1 {v3.s}[0],[x27],x14                    \n\t" // Load c74  into quad and increment by rs_c.
+" ld1 {v3.s}[1],[x27],x14                    \n\t" // Load c75  into quad and increment by rs_c.
+" ld1 {v3.s}[2],[x27],x14                    \n\t" // Load c76  into quad and increment by rs_c.
+" ld1 {v3.s}[3],[x27],x14                    \n\t" // Load c77  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x23                               \n\t"
+"                                            \n\t"
+" ld1 {v4.s}[0],[x27],x14                    \n\t" // Load c80  into quad and increment by rs_c.
+" ld1 {v4.s}[1],[x27],x14                    \n\t" // Load c81  into quad and increment by rs_c.
+" ld1 {v4.s}[2],[x27],x14                    \n\t" // Load c82  into quad and increment by rs_c.
+" ld1 {v4.s}[3],[x27],x14                    \n\t" // Load c83  into quad and increment by rs_c.
+" ld1 {v5.s}[0],[x27],x14                    \n\t" // Load c84  into quad and increment by rs_c.
+" ld1 {v5.s}[1],[x27],x14                    \n\t" // Load c85  into quad and increment by rs_c.
+" ld1 {v5.s}[2],[x27],x14                    \n\t" // Load c86  into quad and increment by rs_c.
+" ld1 {v5.s}[3],[x27],x14                    \n\t" // Load c87  into quad and increment by rs_c.
+"                                            \n\t"
+" fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v2.4s,v2.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v3.4s,v3.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
+" fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
+"                                            \n\t"
+" .SBETAZEROGENSTOREDS3:                     \n\t"
+"                                            \n\t"
+" fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v2.4s,v22.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v3.4s,v23.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
+" fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
+"                                            \n\t"
+" mov x27, x21                               \n\t"
+"                                            \n\t"
+" st1 {v0.s}[0],[x27],x14                    \n\t" // Store c60  into quad and increment by rs_c.
+" st1 {v0.s}[1],[x27],x14                    \n\t" // Store c61  into quad and increment by rs_c.
+" st1 {v0.s}[2],[x27],x14                    \n\t" // Store c62  into quad and increment by rs_c.
+" st1 {v0.s}[3],[x27],x14                    \n\t" // Store c63  into quad and increment by rs_c.
+" st1 {v1.s}[0],[x27],x14                    \n\t" // Store c64  into quad and increment by rs_c.
+" st1 {v1.s}[1],[x27],x14                    \n\t" // Store c65  into quad and increment by rs_c.
+" st1 {v1.s}[2],[x27],x14                    \n\t" // Store c66  into quad and increment by rs_c.
+" st1 {v1.s}[3],[x27],x14                    \n\t" // Store c67  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x22                               \n\t"
+"                                            \n\t"
+" st1 {v2.s}[0],[x27],x14                    \n\t" // Store c70  into quad and increment by rs_c.
+" st1 {v2.s}[1],[x27],x14                    \n\t" // Store c71  into quad and increment by rs_c.
+" st1 {v2.s}[2],[x27],x14                    \n\t" // Store c72  into quad and increment by rs_c.
+" st1 {v2.s}[3],[x27],x14                    \n\t" // Store c73  into quad and increment by rs_c.
+" st1 {v3.s}[0],[x27],x14                    \n\t" // Store c74  into quad and increment by rs_c.
+" st1 {v3.s}[1],[x27],x14                    \n\t" // Store c75  into quad and increment by rs_c.
+" st1 {v3.s}[2],[x27],x14                    \n\t" // Store c76  into quad and increment by rs_c.
+" st1 {v3.s}[3],[x27],x14                    \n\t" // Store c77  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x23                               \n\t"
+"                                            \n\t"
+" st1 {v4.s}[0],[x27],x14                    \n\t" // Store c80  into quad and increment by rs_c.
+" st1 {v4.s}[1],[x27],x14                    \n\t" // Store c81  into quad and increment by rs_c.
+" st1 {v4.s}[2],[x27],x14                    \n\t" // Store c82  into quad and increment by rs_c.
+" st1 {v4.s}[3],[x27],x14                    \n\t" // Store c83  into quad and increment by rs_c.
+" st1 {v5.s}[0],[x27],x14                    \n\t" // Store c84  into quad and increment by rs_c.
+" st1 {v5.s}[1],[x27],x14                    \n\t" // Store c85  into quad and increment by rs_c.
+" st1 {v5.s}[2],[x27],x14                    \n\t" // Store c86  into quad and increment by rs_c.
+" st1 {v5.s}[3],[x27],x14                    \n\t" // Store c87  into quad and increment by rs_c.
+"                                            \n\t"
+" dup  v8.4s, wzr                            \n\t"
+" dup  v9.4s, wzr                            \n\t"
+" dup  v10.4s, wzr                           \n\t"
+" dup  v11.4s, wzr                           \n\t"
+" dup  v12.4s, wzr                           \n\t"
+" dup  v13.4s, wzr                           \n\t"
+"                                            \n\t"
+" fcmp s7,#0.0                               \n\t"
+" beq .SBETAZEROGENSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+"                                            \n\t"
+" mov x27, x24                               \n\t"
+"                                            \n\t"
+" ld1 {v8.s}[0],[x27],x14                    \n\t" // Load c90  into quad and increment by rs_c.
+" ld1 {v8.s}[1],[x27],x14                    \n\t" // Load c91  into quad and increment by rs_c.
+" ld1 {v8.s}[2],[x27],x14                    \n\t" // Load c92  into quad and increment by rs_c.
+" ld1 {v8.s}[3],[x27],x14                    \n\t" // Load c93  into quad and increment by rs_c.
+" ld1 {v9.s}[0],[x27],x14                    \n\t" // Load c94  into quad and increment by rs_c.
+" ld1 {v9.s}[1],[x27],x14                    \n\t" // Load c95  into quad and increment by rs_c.
+" ld1 {v9.s}[2],[x27],x14                    \n\t" // Load c96  into quad and increment by rs_c.
+" ld1 {v9.s}[3],[x27],x14                    \n\t" // Load c97  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x25                               \n\t"
+"                                            \n\t"
+" ld1 {v10.s}[0],[x27],x14                   \n\t" // Load c100  into quad and increment by rs_c.
+" ld1 {v10.s}[1],[x27],x14                   \n\t" // Load c101  into quad and increment by rs_c.
+" ld1 {v10.s}[2],[x27],x14                   \n\t" // Load c102  into quad and increment by rs_c.
+" ld1 {v10.s}[3],[x27],x14                   \n\t" // Load c103  into quad and increment by rs_c.
+" ld1 {v11.s}[0],[x27],x14                   \n\t" // Load c104  into quad and increment by rs_c.
+" ld1 {v11.s}[1],[x27],x14                   \n\t" // Load c105  into quad and increment by rs_c.
+" ld1 {v11.s}[2],[x27],x14                   \n\t" // Load c106  into quad and increment by rs_c.
+" ld1 {v11.s}[3],[x27],x14                   \n\t" // Load c107  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x26                               \n\t"
+"                                            \n\t"
+" ld1 {v12.s}[0],[x27],x14                   \n\t" // Load c110  into quad and increment by rs_c.
+" ld1 {v12.s}[1],[x27],x14                   \n\t" // Load c111  into quad and increment by rs_c.
+" ld1 {v12.s}[2],[x27],x14                   \n\t" // Load c112  into quad and increment by rs_c.
+" ld1 {v12.s}[3],[x27],x14                   \n\t" // Load c113  into quad and increment by rs_c.
+" ld1 {v13.s}[0],[x27],x14                   \n\t" // Load c114  into quad and increment by rs_c.
+" ld1 {v13.s}[1],[x27],x14                   \n\t" // Load c115  into quad and increment by rs_c.
+" ld1 {v13.s}[2],[x27],x14                   \n\t" // Load c116  into quad and increment by rs_c.
+" ld1 {v13.s}[3],[x27],x14                   \n\t" // Load c117  into quad and increment by rs_c.
+"                                            \n\t"
+" fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
+" fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
+" fmul v10.4s,v10.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v11.4s,v11.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
+" fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
+"                                            \n\t"
+" .SBETAZEROGENSTOREDS4:                     \n\t"
+"                                            \n\t"
+" prfm pldl2keep,[x3]                        \n\t"
+" prfm pldl2keep,[x4]                        \n\t"
+"                                            \n\t"
+" fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v10.4s,v28.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v11.4s,v29.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
+" fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
+"                                            \n\t"
+" mov x27, x24                               \n\t"
+"                                            \n\t"
+" st1 {v8.s}[0],[x27],x14                    \n\t" // Store c90  into quad and increment by rs_c.
+" st1 {v8.s}[1],[x27],x14                    \n\t" // Store c91  into quad and increment by rs_c.
+" st1 {v8.s}[2],[x27],x14                    \n\t" // Store c92  into quad and increment by rs_c.
+" st1 {v8.s}[3],[x27],x14                    \n\t" // Store c93  into quad and increment by rs_c.
+" st1 {v9.s}[0],[x27],x14                    \n\t" // Store c94  into quad and increment by rs_c.
+" st1 {v9.s}[1],[x27],x14                    \n\t" // Store c95  into quad and increment by rs_c.
+" st1 {v9.s}[2],[x27],x14                    \n\t" // Store c96  into quad and increment by rs_c.
+" st1 {v9.s}[3],[x27],x14                    \n\t" // Store c97  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x25                               \n\t"
+"                                            \n\t"
+" st1 {v10.s}[0],[x27],x14                   \n\t" // Store c100  into quad and increment by rs_c.
+" st1 {v10.s}[1],[x27],x14                   \n\t" // Store c101  into quad and increment by rs_c.
+" st1 {v10.s}[2],[x27],x14                   \n\t" // Store c102  into quad and increment by rs_c.
+" st1 {v10.s}[3],[x27],x14                   \n\t" // Store c103  into quad and increment by rs_c.
+" st1 {v11.s}[0],[x27],x14                   \n\t" // Store c104  into quad and increment by rs_c.
+" st1 {v11.s}[1],[x27],x14                   \n\t" // Store c105  into quad and increment by rs_c.
+" st1 {v11.s}[2],[x27],x14                   \n\t" // Store c106  into quad and increment by rs_c.
+" st1 {v11.s}[3],[x27],x14                   \n\t" // Store c107  into quad and increment by rs_c.
+"                                            \n\t"
+" mov x27, x26                               \n\t"
+"                                            \n\t"
+" st1 {v12.s}[0],[x27],x14                   \n\t" // Store c110  into quad and increment by rs_c.
+" st1 {v12.s}[1],[x27],x14                   \n\t" // Store c111  into quad and increment by rs_c.
+" st1 {v12.s}[2],[x27],x14                   \n\t" // Store c112  into quad and increment by rs_c.
+" st1 {v12.s}[3],[x27],x14                   \n\t" // Store c113  into quad and increment by rs_c.
+" st1 {v13.s}[0],[x27],x14                   \n\t" // Store c114  into quad and increment by rs_c.
+" st1 {v13.s}[1],[x27],x14                   \n\t" // Store c115  into quad and increment by rs_c.
+" st1 {v13.s}[2],[x27],x14                   \n\t" // Store c116  into quad and increment by rs_c.
+" st1 {v13.s}[3],[x27],x14                   \n\t" // Store c117  into quad and increment by rs_c.
 "                                            \n\t"
 " .SEND:                                     \n\t" // Done!
 "                                            \n\t"
@@ -410,10 +1060,13 @@ __asm__ volatile
  [b_next] "m" (b_next), // 10
  [k]      "m" (k)       // 11
 :// Register clobber list
- "x0", "x1", "x2", "x4",
+ "x0", "x1", "x2","x3","x4",
  "x5", "x6", "x7", "x8",
  "x9", "x10","x11","x12",
- "x13","x14","x20",
+ "x13","x14","x15",
+ "x16","x17","x18","x19",       
+ "x20","x21","x22","x23",
+ "x24","x25","x26","x27",
  "v0", "v1", "v2", "v3",
  "v4", "v5", "v6", "v7",
  "v8", "v9", "v10","v11",
@@ -421,17 +1074,32 @@ __asm__ volatile
  "v16","v17","v18","v19",
  "v20","v21","v22","v23",
  "v24","v25","v26","v27",
- "v30","v31"
+ "v28","v29","v30","v31"
 );
 
 }
 
 
 /*
+   o 4x4 Double precision micro-kernel NOT fully functional yet.
+   o Runnable on ARMv8, compiled with aarch64 GCC.
+   o Use it together with the armv8 BLIS configuration.
+   o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz. 
+
+   December 2014.
+  
+ * UPDATE OCTOBER 2015: The kernel is now fully functional.
  * Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz.
  * Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz.
+ 
+ * UPDATE NOVEMBER 2015
+ * Micro-kernel changed to 6x8
+ * Tested on Juno Board. Around 4   GFLOPS, 1 x A57 core  @ 1.1 GHz.
+ * Tested on Juno Board. Around 7.6 GFLOPS, 2 x A57 cores @ 1.1 GHz.
+ * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core  @ 850 MHz. 
+ * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz.
 */
-void bli_dgemm_opt_4x4(
+void bli_dgemm_opt_6x8(
                         dim_t              k,
                         double*   restrict alpha,
                         double*   restrict a,
@@ -444,8 +1112,8 @@ void bli_dgemm_opt_4x4(
 	void* a_next = bli_auxinfo_next_a( data );
 	void* b_next = bli_auxinfo_next_b( data );
 
-	dim_t k_iter = k / 2;
-	dim_t k_left = k % 2;
+	dim_t k_iter = k / 4;
+	dim_t k_left = k % 4;
 
 __asm__ volatile
 (
@@ -454,10 +1122,8 @@ __asm__ volatile
 " ldr x1,%[baddr]                            \n\t" // Load address of B
 " ldr x2,%[caddr]                            \n\t" // Load address of C
 "                                            \n\t"
-" mov x4,#0                                  \n\t" // Init loop counter (i=0)
-"                                            \n\t"
-" ldr x16,%[a_next]                          \n\t" // Move pointer
-" ldr x17,%[b_next]                          \n\t" // Move pointer
+" ldr x3,%[a_next]                           \n\t" // Move pointer
+" ldr x4,%[b_next]                           \n\t" // Move pointer
 "                                            \n\t"
 " ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
 " ldr x6,%[k_left]                           \n\t" // Init guard (k_iter)
@@ -467,123 +1133,414 @@ __asm__ volatile
 "                                            \n\t" 
 " ldr x9,%[cs_c]                             \n\t" // Load cs_c
 " lsl x10,x9,#3                              \n\t" // cs_c * sizeof(double)
-" lsl x11,x9,#4                              \n\t" // 2 * cs_c * sizeof(double) -- AUX.
-" lsl x12,x9,#5                              \n\t" // 3 * cs_c * sizeof(double) -- AUX.
 "                                            \n\t"
 " ldr x13,%[rs_c]                            \n\t" // Load rs_c.
 " lsl x14,x13,#3                             \n\t" // rs_c * sizeof(double). 
 "                                            \n\t"
-" prfm pldl1keep,[x2,0]                      \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x10]                    \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x11]                    \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x12]                    \n\t" // Prefetch c.
+" add x20,x2,x10                             \n\t" //Load address Column 1 of C
+" add x21,x20,x10                            \n\t" //Load address Column 2 of C
+" add x22,x21,x10                            \n\t" //Load address Column 3 of C
+" add x23,x22,x10                            \n\t" //Load address Column 4 of C
+" add x24,x23,x10                            \n\t" //Load address Column 5 of C
+" add x25,x24,x10                            \n\t" //Load address Column 6 of C
+" add x26,x25,x10                            \n\t" //Load address Column 7 of C
 "                                            \n\t"
-" movi v12.2d,#0                             \n\t" // Vector for accummulating column 0 
-" movi v13.2d,#0                             \n\t" // Vector for accummulating column 0
-" movi v14.2d,#0                             \n\t" // Vector for accummulating column 1
-" movi v15.2d,#0                             \n\t" // Vector for accummulating column 1
-" movi v16.2d,#0                             \n\t" // Vector for accummulating column 2 
-" movi v17.2d,#0                             \n\t" // Vector for accummulating column 2
-" movi v18.2d,#0                             \n\t" // Vector for accummulating column 3
-" movi v19.2d,#0                             \n\t" // Vector for accummulating column 3
+" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
+" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
 "                                            \n\t"
-" movi v20.2d,#0                             \n\t" // Vector for accummulating column 0 
-" movi v21.2d,#0                             \n\t" // Vector for accummulating column 0
-" movi v22.2d,#0                             \n\t" // Vector for accummulating column 1
-" movi v23.2d,#0                             \n\t" // Vector for accummulating column 1
-" movi v24.2d,#0                             \n\t" // Vector for accummulating column 2 
-" movi v25.2d,#0                             \n\t" // Vector for accummulating column 2
-" movi v26.2d,#0                             \n\t" // Vector for accummulating column 3
-" movi v27.2d,#0                             \n\t" // Vector for accummulating column 3
+" ldr q0, [x0]                               \n\t"
+" ldr q1, [x0, #16]                          \n\t" // Load a
+" ldr q2, [x0, #32]                          \n\t"
 "                                            \n\t"
-" ld1r {v31.2d},[x8]                         \n\t" // Load beta
+" ldr q3, [x1]                               \n\t" // Load b
+" ldr q4, [x1, #16]                          \n\t"
+" ldr q5, [x1, #32]                          \n\t"
+" ldr q6, [x1, #48]                          \n\t"
+"                                            \n\t"
+" dup  v8.2d, xzr                            \n\t" // Vector for accummulating column 0
+" prfm    PLDL1KEEP, [x1, #256]              \n\t" 
+" dup  v9.2d, xzr                            \n\t" // Vector for accummulating column 0
+" prfm    PLDL1KEEP, [x1, #320]              \n\t"
+" dup  v10.2d, xzr                           \n\t" // Vector for accummulating column 0
+" prfm    PLDL1KEEP, [x1, #384]              \n\t"
+" dup  v11.2d, xzr                           \n\t" // Vector for accummulating column 1
+" prfm    PLDL1KEEP, [x1, #448]              \n\t"
+" dup  v12.2d, xzr                           \n\t" // Vector for accummulating column 1 
+" dup  v13.2d, xzr                           \n\t" // Vector for accummulating column 1
+"                                            \n\t"
+" dup  v14.2d, xzr                           \n\t" // Vector for accummulating column 2
+" prfm    PLDL1KEEP, [x0, #192]              \n\t"
+" dup  v15.2d, xzr                           \n\t" // Vector for accummulating column 2
+" prfm    PLDL1KEEP, [x0, #256]              \n\t"
+" dup  v16.2d, xzr                           \n\t" // Vector for accummulating column 2
+" prfm    PLDL1KEEP, [x0, #320]              \n\t"
+" dup  v17.2d, xzr                           \n\t" // Vector for accummulating column 3
+" dup  v18.2d, xzr                           \n\t" // Vector for accummulating column 3 
+" dup  v19.2d, xzr                           \n\t" // Vector for accummulating column 3
+"                                            \n\t"
+" dup  v20.2d, xzr                           \n\t" // Vector for accummulating column 4 
+" dup  v21.2d, xzr                           \n\t" // Vector for accummulating column 4
+" dup  v22.2d, xzr                           \n\t" // Vector for accummulating column 4
+" dup  v23.2d, xzr                           \n\t" // Vector for accummulating column 5
+" dup  v24.2d, xzr                           \n\t" // Vector for accummulating column 5 
+" dup  v25.2d, xzr                           \n\t" // Vector for accummulating column 5
+"                                            \n\t"
+" dup  v26.2d, xzr                           \n\t" // Vector for accummulating column 6 
+" dup  v27.2d, xzr                           \n\t" // Vector for accummulating column 6
+" dup  v28.2d, xzr                           \n\t" // Vector for accummulating column 6
+" dup  v29.2d, xzr                           \n\t" // Vector for accummulating column 7
+" dup  v30.2d, xzr                           \n\t" // Vector for accummulating column 7 
+" dup  v31.2d, xzr                           \n\t" // Vector for accummulating column 7
 "                                            \n\t"
 "                                            \n\t"
 " cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
 " beq .DCONSIDERKLEFT                        \n\t"
 "                                            \n\t"
-" ldp q0,q1,[x0],32                          \n\t" // Load a
-" ldp q4,q5,[x1],32                          \n\t" // Load b
+"add x0, x0, #48                             \n\t" //update address of A
+"add x1, x1, #64                             \n\t" //update address of B
 "                                            \n\t"
 " cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
 " beq .DLASTITER                             \n\t" // (as loop is do-while-like).
-"                                            \n\t" 
 "                                            \n\t"
 " DLOOP:                                     \n\t" // Body
 "                                            \n\t"
-" prfm pldl1keep,[x0,#1024]                  \n\t" // Prefetch.    
-" prfm pldl1keep,[x1,#1024]                  \n\t" // Prefetch.    
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" prfm    PLDL1KEEP, [x1, #448]              \n\t" //512-64=448
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" prfm    PLDL1KEEP, [x1, #512]              \n\t"
+" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
+" prfm    PLDL1KEEP, [x1, #576]              \n\t"
 "                                            \n\t"
-" ldp q6,q7,[x1],32                          \n\t" // Load b+4  into quad
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
 "                                            \n\t"
-" fmla v12.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v14.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+" ldr q3, [x1]                               \n\t"
 "                                            \n\t"
-" fmla v13.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+" ldr q7, [x0, #32]                          \n\t"
 "                                            \n\t"
-" ldp q2,q3,[x0],32                          \n\t" // Load a+4  into quad
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+" ldr q4, [x1, #16]                          \n\t"
 "                                            \n\t"
-" fmla v16.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v18.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+" ldr q5, [x1, #32]                          \n\t"
 "                                            \n\t"
-" fmla v17.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v19.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q0, [x0]                               \n\t"
 "                                            \n\t"
-" ldp q0,q1,[x0],32                          \n\t" // Load a    into quad
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q1, [x0, #16]                          \n\t"
 "                                            \n\t"
-" fmla v12.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v14.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q6, [x1, #48]                          \n\t"
+"                                            \n\t"                  // End it 1
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" prfm    PLDL1KEEP, [x1, #640]              \n\t"
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" prfm    PLDL1KEEP, [x0, #336]              \n\t"
+" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
+" prfm    PLDL1KEEP, [x0, #400]              \n\t"
 "                                            \n\t"
-" fmla v13.2d,v3.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v3.2d,v6.d[1]                  \n\t" // Accummulate
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
 "                                            \n\t"
-" ldp q4,q5,[x1],32                          \n\t" // Load b    into quad
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
+" ldr q3, [x1, #64]                          \n\t"
 "                                            \n\t"
-" fmla v16.2d,v2.2d,v7.d[0]                  \n\t" // Accummulate
-" fmla v18.2d,v2.2d,v7.d[1]                  \n\t" // Accummulate
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
+" ldr q2, [x0, #80]                          \n\t"
 "                                            \n\t"
-" fmla v17.2d,v3.2d,v7.d[0]                  \n\t" // Accummulate
-" fmla v19.2d,v3.2d,v7.d[1]                  \n\t" // Accummulate
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
+" ldr q4, [x1, #80]                          \n\t"
 "                                            \n\t"
-" prfm pldl1keep,[x0,#64]                    \n\t" // Prefetch.  
-" prfm pldl1keep,[x1,#64]                    \n\t" // Prefetch.   
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
+" ldr q5, [x1, #96]                          \n\t"
+"                                            \n\t"
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q0, [x0, #48]                          \n\t"
+"                                            \n\t"
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q1, [x0, #64]                          \n\t"
+"                                            \n\t"
+" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q6, [x1, #112]                         \n\t"
+"                                            \n\t"                  //End it 2
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" prfm    PLDL1KEEP, [x0, #464]              \n\t"
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+" ldr q3, [x1, #128]                         \n\t"
+"                                            \n\t"
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+" ldr q7, [x0, #128]                         \n\t"
+"                                            \n\t"
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+" ldr q4, [x1, #144]                         \n\t"
+"                                            \n\t"
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+" ldr q5, [x1, #160]                         \n\t"
+"                                            \n\t"
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q0, [x0, #96]                          \n\t"
+"                                            \n\t"
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q1, [x0, #112]                         \n\t"
+"                                            \n\t"
+" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q6, [x1, #176]                         \n\t"
+"                                            \n\t"                  // End it 3
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
+" ldr q3, [x1, #192]                         \n\t"
+"                                            \n\t"
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
+" ldr q2, [x0, #176]                         \n\t"
+"                                            \n\t"
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
+" ldr q4, [x1, #208]                         \n\t"
+"                                            \n\t"
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
+" ldr q5, [x1, #224]                         \n\t"
+"                                            \n\t"
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q0, [x0, #144]                         \n\t"
+"                                            \n\t"
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q1, [x0, #160]                         \n\t"
+"                                            \n\t"
+" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q6, [x1, #240]                         \n\t"
+"                                            \n\t"                  //End it 4
+" add x0, x0, #192                           \n\t"
+" add x1, x1, #256                           \n\t"
 "                                            \n\t"
 " sub x5,x5,1                                \n\t" // i-=1
 " cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-"bne DLOOP                                   \n\t"
+" bne DLOOP                                  \n\t"
 "                                            \n\t"
 ".DLASTITER:                                 \n\t"
 "                                            \n\t"
-" ldp q6,q7,[x1],32                          \n\t" // Load b+4  into quad
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
 "                                            \n\t"
-" fmla v12.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v14.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
+" ldr q3, [x1]                               \n\t"
 "                                            \n\t"
-" fmla v13.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+" ldr q7, [x0, #32]                          \n\t"
 "                                            \n\t"
-" ldp q2,q3,[x0],32                          \n\t" // Load a+4  into quad
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+" ldr q4, [x1, #16]                          \n\t"
 "                                            \n\t"
-" fmla v16.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v18.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
 "                                            \n\t"
-" fmla v17.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v19.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+" ldr q5, [x1, #32]                          \n\t"
 "                                            \n\t"
-" ld1r {v30.2d},[x7]                         \n\t" // Load alpha.
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q0, [x0]                               \n\t"
 "                                            \n\t"
-" fmla v12.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v14.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q1, [x0, #16]                          \n\t"
 "                                            \n\t"
-" fmla v13.2d,v3.2d,v6.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v3.2d,v6.d[1]                  \n\t" // Accummulate
+" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q6, [x1, #48]                          \n\t"
+"                                            \n\t"                  // End it 1
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
 "                                            \n\t"
-" fmla v16.2d,v2.2d,v7.d[0]                  \n\t" // Accummulate
-" fmla v18.2d,v2.2d,v7.d[1]                  \n\t" // Accummulate
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
+" ldr q3, [x1, #64]                          \n\t"
 "                                            \n\t"
-" fmla v17.2d,v3.2d,v7.d[0]                  \n\t" // Accummulate
-" fmla v19.2d,v3.2d,v7.d[1]                  \n\t" // Accummulate
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
+" ldr q2, [x0, #80]                          \n\t"
+"                                            \n\t"
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
+" ldr q4, [x1, #80]                          \n\t"
+"                                            \n\t"
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
+" ldr q5, [x1, #96]                          \n\t"
+"                                            \n\t"
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q0, [x0, #48]                          \n\t"
+"                                            \n\t"
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q1, [x0, #64]                          \n\t"
+"                                            \n\t"
+" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q6, [x1, #112]                         \n\t"
+"                                            \n\t"                  //End it 2
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
+" ldr q3, [x1, #128]                         \n\t"
+"                                            \n\t"
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
+" ldr q7, [x0, #128]                         \n\t"
+"                                            \n\t"
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
+" ldr q4, [x1, #144]                         \n\t"
+"                                            \n\t"
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+" ldr q5, [x1, #160]                         \n\t"
+"                                            \n\t"
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q0, [x0, #96]                          \n\t"
+"                                            \n\t"
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q1, [x0, #112]                         \n\t"
+"                                            \n\t"
+" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+" ldr q6, [x1, #176]                         \n\t"
+"                                            \n\t"                  // End it 3
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v10.2d,v7.2d,v3.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v7.2d,v3.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v7.2d,v4.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v7.2d,v4.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v7.2d,v5.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v7.2d,v5.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" add x1, x1, #192                           \n\t"
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v28.2d,v7.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v7.2d,v6.d[1]                  \n\t" // Accummulate
+"                                            \n\t"                  //End it 4
+" add x0, x0, #144                           \n\t"
 "                                            \n\t"
 " .DCONSIDERKLEFT:                           \n\t" 
 " cmp x6,0                                   \n\t" // If k_left == 0, we are done.
@@ -591,182 +1548,488 @@ __asm__ volatile
 "                                            \n\t"
 ".DLOOPKLEFT:                                \n\t"
 "                                            \n\t"
-" prfm pldl1keep,[x0,#1024]                  \n\t" // Prefetch.
-" prfm pldl1keep,[x1,#1024]                  \n\t" // Prefetch.
+" ldr q0, [x0],#16                           \n\t"
+" ldr q1, [x0],#16                           \n\t" // Load a
+" ldr q2, [x0],#16                           \n\t"
 "                                            \n\t"
-" ldp q0,q1,[x0],32                          \n\t" // Load a    into quad
-" ldp q4,q5,[x1],32                          \n\t" // Load b    into quad
+" ldr q3, [x1],#16                           \n\t" // Load b
+" ldr q4, [x1],#16                           \n\t"
+" ldr q5, [x1],#16                           \n\t"
+" ldr q6, [x1],#16                           \n\t"
 "                                            \n\t"
-//" sub x6,x6,1                                \n\t"
+" sub x6,x6,1                                \n\t"
 "                                            \n\t"
-" fmla v12.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v14.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
+" fmla v10.2d,v2.2d,v3.d[0]                  \n\t" // Accummulate
 "                                            \n\t"
-" fmla v13.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
-" fmla v15.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v11.2d,v0.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v12.2d,v1.2d,v3.d[1]                  \n\t" // Accummulate
+" fmla v13.2d,v2.2d,v3.d[1]                  \n\t" // Accummulate
 "                                            \n\t"
-" fmla v16.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v18.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v14.2d,v0.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v15.2d,v1.2d,v4.d[0]                  \n\t" // Accummulate
+" fmla v16.2d,v2.2d,v4.d[0]                  \n\t" // Accummulate
 "                                            \n\t"
-" fmla v17.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
-" fmla v19.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v17.2d,v0.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v18.2d,v1.2d,v4.d[1]                  \n\t" // Accummulate
+" fmla v19.2d,v2.2d,v4.d[1]                  \n\t" // Accummulate
 "                                            \n\t"
-//" cmp x6,0                                   \n\t" // Iterate again.
-//" bne .DLOOPKLEFT                            \n\t" // if i!=0.
+" fmla v20.2d,v0.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v21.2d,v1.2d,v5.d[0]                  \n\t" // Accummulate
+" fmla v22.2d,v2.2d,v5.d[0]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v23.2d,v0.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v24.2d,v1.2d,v5.d[1]                  \n\t" // Accummulate
+" fmla v25.2d,v2.2d,v5.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v26.2d,v0.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v29.2d,v0.2d,v6.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v27.2d,v1.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v30.2d,v1.2d,v6.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" fmla v28.2d,v2.2d,v6.d[0]                  \n\t" // Accummulate
+" fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
+"                                            \n\t"
+" cmp x6,0                                   \n\t" // Iterate again.
+" bne .DLOOPKLEFT                            \n\t" // if i!=0.
 "                                            \n\t"
 " .DPOSTACCUM:                               \n\t"
-" ld1r {v30.2d},[x7]                         \n\t" // Load alpha.
+"                                            \n\t"
+" ld1r {v6.2d},[x7]                          \n\t" // Load alpha.
+" ld1r {v7.2d},[x8]                          \n\t" // Load beta
 "                                            \n\t"
 " cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
 " bne .DGENSTORED                            \n\t"
 "                                            \n\t"
 " .DCOLSTORED:                               \n\t" // C is column-major.
-" fcmp d31,#0.0                              \n\t"
-" beq .DBETAZEROCOLSTORED                    \n\t" // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
+" dup  v0.2d, xzr                            \n\t"
+" dup  v1.2d, xzr                            \n\t"
+" dup  v2.2d, xzr                            \n\t"
+" dup  v3.2d, xzr                            \n\t"
+" dup  v4.2d, xzr                            \n\t"
+" dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
+" fcmp d7,#0.0                               \n\t"
+" beq .DBETAZEROCOLSTOREDS1                  \n\t" // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldp q0,q1,[x2]                             \n\t" // Load c    into quad and increment by cs_c
-" add x2,x2,x10                              \n\t"
-" ldp q2,q3,[x2]                             \n\t" // Load c    into quad and increment by cs_c
-" add x2,x2,x10                              \n\t"
-" ldp q4,q5,[x2]                             \n\t" // Load c    into quad and increment by cs_c
-" add x2,x2,x10                              \n\t"
-" ldp q6,q7,[x2]                             \n\t" // Load c    into quad and increment by cs_c
+" ldr q0, [x2]                               \n\t" //Load column 0 of C
+" ldr q1, [x2, #16]                          \n\t"
+" ldr q2, [x2, #32]                          \n\t"
 "                                            \n\t"
+" ldr q3, [x20]                              \n\t" //Load column 1 of C
+" ldr q4, [x20, #16]                         \n\t"
+" ldr q5, [x20, #32]                         \n\t"
 "                                            \n\t"
-" fmul v20.2d,v0.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v21.2d,v1.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v22.2d,v2.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v23.2d,v3.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v24.2d,v4.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v25.2d,v5.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v26.2d,v6.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v27.2d,v7.2d,v31.d[0]                 \n\t" // Scale by beta
+" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // Scale by beta
+" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // Scale by beta
+" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // Scale by beta
+" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // Scale by beta
+" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
+" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" prfm pldl2keep,[x16]                       \n\t"
-" prfm pldl2keep,[x17]                       \n\t"
+" .DBETAZEROCOLSTOREDS1:                     \n\t"
 "                                            \n\t"
-" .DBETAZEROCOLSTORED:                       \n\t" // If beta==0, we won't read from C (nor scale).
+" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
+" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
+" fmla v2.2d,v10.2d,v6.d[0]                  \n\t" // Scale by alpha
+" fmla v3.2d,v11.2d,v6.d[0]                  \n\t" // Scale by alpha
+" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // Scale by alpha
+" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C
+" str q0, [x2]                               \n\t" //Store column 0 of C
+" str q1, [x2, #16]                          \n\t"
+" str q2, [x2, #32]                          \n\t"
 "                                            \n\t"
-" fmla v20.2d,v12.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v21.2d,v13.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v22.2d,v14.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v23.2d,v15.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v24.2d,v16.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v25.2d,v17.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v26.2d,v18.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v27.2d,v19.2d,v30.d[0]                \n\t" // Scale by alpha
+" str q3, [x20]                              \n\t" //Store column 1 of C
+" str q4, [x20, #16]                         \n\t"
+" str q5, [x20, #32]                         \n\t"
 "                                            \n\t"
-" stp q20,q21,[x2]                           \n\t" // Store quad into c    and increment by cs_c
-" add x2,x2,x10                              \n\t"
-" stp q22,q23,[x2]                           \n\t" // Store quad into c+4  and increment by cs_c
-" add x2,x2,x10                              \n\t"
-" stp q24,q25,[x2]                           \n\t" // Store quad into c+8  and increment by cs_c
-" add x2,x2,x10                              \n\t"
-" stp q26,q27,[x2]                           \n\t" // Store quad into c+16 and increment by cs_c
+" dup  v8.2d, xzr                            \n\t" // Zero v8-v13: when beta==0 the loads below are skipped and C is never read.
+" dup  v9.2d, xzr                            \n\t"
+" dup  v10.2d, xzr                           \n\t"
+" dup  v11.2d, xzr                           \n\t"
+" dup  v12.2d, xzr                           \n\t"
+" dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
-" b .DEND                                    \n\t" // Done (TODO: this obviously needs to be moved down to remove jump).
+" fcmp d7,#0.0                               \n\t" // Compare beta (d7) against zero.
+" beq .DBETAZEROCOLSTOREDS2                  \n\t" // beta==0: skip loading and scaling C.
+"                                            \n\t"
+" ldr q8, [x21]                              \n\t" // Load column 2 of C (x21), two doubles per quad.
+" ldr q9, [x21, #16]                         \n\t"
+" ldr q10, [x21, #32]                        \n\t"
+"                                            \n\t"
+" ldr q11, [x22]                             \n\t" // Load column 3 of C (x22), two doubles per quad.
+" ldr q12, [x22, #16]                        \n\t"
+" ldr q13, [x22, #32]                        \n\t"
+"                                            \n\t"
+" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // v8  *= beta
+" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // v9  *= beta
+" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // v10 *= beta
+" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // v11 *= beta
+" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // v12 *= beta
+" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // v13 *= beta
+"                                            \n\t"
+" .DBETAZEROCOLSTOREDS2:                     \n\t" // v8-v13 hold beta*C here, or zero when beta==0.
+"                                            \n\t"
+" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // v8  += alpha * v14
+" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // v9  += alpha * v15
+" fmla v10.2d,v16.2d,v6.d[0]                 \n\t" // v10 += alpha * v16
+" fmla v11.2d,v17.2d,v6.d[0]                 \n\t" // v11 += alpha * v17
+" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // v12 += alpha * v18
+" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // v13 += alpha * v19
+"                                            \n\t"
+" str q8, [x21]                              \n\t" // Store column 2 of C.
+" str q9, [x21, #16]                         \n\t"
+" str q10, [x21, #32]                        \n\t"
+"                                            \n\t"
+" str q11, [x22]                             \n\t" // Store column 3 of C.
+" str q12, [x22, #16]                        \n\t"
+" str q13, [x22, #32]                        \n\t"
+"                                            \n\t"
+" dup  v0.2d, xzr                            \n\t" // Zero v0-v5: when beta==0 the loads below are skipped and C is never read.
+" dup  v1.2d, xzr                            \n\t"
+" dup  v2.2d, xzr                            \n\t"
+" dup  v3.2d, xzr                            \n\t"
+" dup  v4.2d, xzr                            \n\t"
+" dup  v5.2d, xzr                            \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t" // Compare beta (d7) against zero.
+" beq .DBETAZEROCOLSTOREDS3                  \n\t" // beta==0: skip loading and scaling C.
+"                                            \n\t"
+" ldr q0, [x23]                              \n\t" // Load column 4 of C (x23), two doubles per quad.
+" ldr q1, [x23, #16]                         \n\t"
+" ldr q2, [x23, #32]                         \n\t"
+"                                            \n\t"
+" ldr q3, [x24]                              \n\t" // Load column 5 of C (x24), two doubles per quad.
+" ldr q4, [x24, #16]                         \n\t"
+" ldr q5, [x24, #32]                         \n\t"
+"                                            \n\t"
+" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // v0 *= beta
+" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // v1 *= beta
+" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // v2 *= beta
+" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // v3 *= beta
+" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // v4 *= beta
+" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // v5 *= beta
+"                                            \n\t"
+" .DBETAZEROCOLSTOREDS3:                     \n\t" // v0-v5 hold beta*C here, or zero when beta==0.
+"                                            \n\t"
+" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // v0 += alpha * v20
+" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // v1 += alpha * v21
+" fmla v2.2d,v22.2d,v6.d[0]                  \n\t" // v2 += alpha * v22
+" fmla v3.2d,v23.2d,v6.d[0]                  \n\t" // v3 += alpha * v23
+" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // v4 += alpha * v24
+" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // v5 += alpha * v25
+"                                            \n\t"
+" str q0, [x23]                              \n\t" // Store column 4 of C.
+" str q1, [x23, #16]                         \n\t"
+" str q2, [x23, #32]                         \n\t"
+"                                            \n\t"
+" str q3, [x24]                              \n\t" // Store column 5 of C.
+" str q4, [x24, #16]                         \n\t"
+" str q5, [x24, #32]                         \n\t"
+"                                            \n\t"
+" dup  v8.2d, xzr                            \n\t" // Zero v8-v13: when beta==0 the loads below are skipped and C is never read.
+" dup  v9.2d, xzr                            \n\t"
+" dup  v10.2d, xzr                           \n\t"
+" dup  v11.2d, xzr                           \n\t"
+" dup  v12.2d, xzr                           \n\t"
+" dup  v13.2d, xzr                           \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t" // Compare beta (d7) against zero.
+" beq .DBETAZEROCOLSTOREDS4                  \n\t" // beta==0: skip loading and scaling C.
+"                                            \n\t"
+" ldr q8, [x25]                              \n\t" // Load column 6 of C (x25), two doubles per quad.
+" ldr q9, [x25, #16]                         \n\t"
+" ldr q10, [x25, #32]                        \n\t"
+"                                            \n\t"
+" ldr q11, [x26]                             \n\t" // Load column 7 of C (x26), two doubles per quad.
+" ldr q12, [x26, #16]                        \n\t"
+" ldr q13, [x26, #32]                        \n\t"
+"                                            \n\t"
+" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // v8  *= beta
+" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // v9  *= beta
+" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // v10 *= beta
+" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // v11 *= beta
+" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // v12 *= beta
+" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // v13 *= beta
+"                                            \n\t"
+" .DBETAZEROCOLSTOREDS4:                     \n\t" // v8-v13 hold beta*C here, or zero when beta==0.
+"                                            \n\t"
+" prfm pldl2keep,[x3]                        \n\t" // Prefetch [x3] for load into L2. NOTE(review): x3 presumably holds a_next - confirm.
+" prfm pldl2keep,[x4]                        \n\t" // Prefetch [x4] for load into L2. NOTE(review): x4 presumably holds b_next - confirm.
+"                                            \n\t"
+" fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // v8  += alpha * v26
+" fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // v9  += alpha * v27
+" fmla v10.2d,v28.2d,v6.d[0]                 \n\t" // v10 += alpha * v28
+" fmla v11.2d,v29.2d,v6.d[0]                 \n\t" // v11 += alpha * v29
+" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // v12 += alpha * v30
+" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // v13 += alpha * v31
+"                                            \n\t"
+" str q8, [x25]                              \n\t" // Store column 6 of C.
+" str q9, [x25, #16]                         \n\t"
+" str q10, [x25, #32]                        \n\t"
+"                                            \n\t"
+" str q11, [x26]                             \n\t" // Store column 7 of C.
+" str q12, [x26, #16]                        \n\t"
+" str q13, [x26, #32]                        \n\t"
+"                                            \n\t"
+" b .DEND                                    \n\t" // Column-stored path finished; jump over the general-stride path.
 "                                            \n\t"
 " .DGENSTORED:                               \n\t" // C is general-stride stored.
 "                                            \n\t"
-" fcmp d31,#0.0                              \n\t"
-" beq .DBETAZEROGENSTORED                    \n\t"
-"                                            \n\t" // If beta!=0, then we can read from C.
-"                                            \n\t" // TODO: this was done fast. Rearrange to remove so many address reloads.
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
+" dup  v0.2d, xzr                            \n\t" // Zero v0-v5: when beta==0 the loads below are skipped and C is never read.
+" dup  v1.2d, xzr                            \n\t"
+" dup  v2.2d, xzr                            \n\t"
+" dup  v3.2d, xzr                            \n\t"
+" dup  v4.2d, xzr                            \n\t"
+" dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
-" ld1 {v0.d}[0],[x2],x14                     \n\t" // Load c00  into quad and increment by rs_c.
-" ld1 {v0.d}[1],[x2],x14                     \n\t" // Load c01  into quad and increment by rs_c.
-" ld1 {v1.d}[0],[x2],x14                     \n\t" // Load c02  into quad and increment by rs_c.
-" ld1 {v1.d}[1],[x2],x14                     \n\t" // Load c03  into quad and increment by rs_c.
+" fcmp d7,#0.0                               \n\t" // Compare beta (d7) against zero.
+" beq .DBETAZEROGENSTOREDS1                  \n\t" // beta==0: skip loading and scaling C.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" mov x27, x2                                \n\t" // x27 = running pointer into column 0 of C (x2 itself is preserved).
+"                                            \n\t"
+" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c00 into lane 0 of v0; x27 += rs_c.
+" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c01 into lane 1 of v0; x27 += rs_c.
+" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c02 into lane 0 of v1; x27 += rs_c.
+" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c03 into lane 1 of v1; x27 += rs_c.
+" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c04 into lane 0 of v2; x27 += rs_c.
+" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c05 into lane 1 of v2; x27 += rs_c.
 "                                            \n\t"
-" ld1 {v2.d}[0],[x2],x14                     \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v2.d}[1],[x2],x14                     \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v3.d}[0],[x2],x14                     \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v3.d}[1],[x2],x14                     \n\t" // Load c13  into quad and increment by rs_c.
+" mov x27, x20                               \n\t" // x27 = running pointer into column 1 of C.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c10 into lane 0 of v3; x27 += rs_c.
+" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c11 into lane 1 of v3; x27 += rs_c.
+" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c12 into lane 0 of v4; x27 += rs_c.
+" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c13 into lane 1 of v4; x27 += rs_c.
+" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c14 into lane 0 of v5; x27 += rs_c.
+" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c15 into lane 1 of v5; x27 += rs_c.
 "                                            \n\t"
-" ld1 {v4.d}[0],[x2],x14                    \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v4.d}[1],[x2],x14                    \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v5.d}[0],[x2],x14                    \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v5.d}[1],[x2],x14                    \n\t" // Load c13  into quad and increment by rs_c.
+" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // v0 *= beta
+" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // v1 *= beta
+" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // v2 *= beta
+" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // v3 *= beta
+" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // v4 *= beta
+" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // v5 *= beta
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" .DBETAZEROGENSTOREDS1:                     \n\t" // v0-v5 hold beta*C here, or zero when beta==0.
 "                                            \n\t"
-" ld1 {v6.d}[0],[x2],x14                    \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v6.d}[1],[x2],x14                    \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v7.d}[0],[x2],x14                    \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v7.d}[1],[x2],x14                    \n\t" // Load c13  into quad and increment by rs_c.
+" fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // v0 += alpha * v8
+" fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // v1 += alpha * v9
+" fmla v2.2d,v10.2d,v6.d[0]                  \n\t" // v2 += alpha * v10
+" fmla v3.2d,v11.2d,v6.d[0]                  \n\t" // v3 += alpha * v11
+" fmla v4.2d,v12.2d,v6.d[0]                  \n\t" // v4 += alpha * v12
+" fmla v5.2d,v13.2d,v6.d[0]                  \n\t" // v5 += alpha * v13
 "                                            \n\t"
-" prfm pldl1keep,[x16,0]                     \n\t" // Prefetch.
-" prfm pldl1keep,[x17,0]                     \n\t" // Prefetch.
+" mov x27, x2                                \n\t" // Rewind x27 to the start of column 0 of C.
 "                                            \n\t"
-" fmul v20.2d,v0.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v21.2d,v1.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v22.2d,v2.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v23.2d,v3.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v24.2d,v4.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v25.2d,v5.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v26.2d,v6.2d,v31.d[0]                 \n\t" // Scale by beta
-" fmul v27.2d,v7.2d,v31.d[0]                 \n\t" // Scale by beta
+" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c00 from lane 0 of v0; x27 += rs_c.
+" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c01 from lane 1 of v0; x27 += rs_c.
+" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c02 from lane 0 of v1; x27 += rs_c.
+" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c03 from lane 1 of v1; x27 += rs_c.
+" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c04 from lane 0 of v2; x27 += rs_c.
+" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c05 from lane 1 of v2; x27 += rs_c.
 "                                            \n\t"
-" .DBETAZEROGENSTORED:                       \n\t" // If beta==0, we cannot read from C (nor scale).
+" mov x27, x20                               \n\t" // Rewind x27 to the start of column 1 of C.
 "                                            \n\t"
-" fmla v20.2d,v12.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v21.2d,v13.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v22.2d,v14.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v23.2d,v15.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v24.2d,v16.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v25.2d,v17.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v26.2d,v18.2d,v30.d[0]                \n\t" // Scale by alpha
-" fmla v27.2d,v19.2d,v30.d[0]                \n\t" // Scale by alpha
+" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c10 from lane 0 of v3; x27 += rs_c.
+" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c11 from lane 1 of v3; x27 += rs_c.
+" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c12 from lane 0 of v4; x27 += rs_c.
+" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c13 from lane 1 of v4; x27 += rs_c.
+" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c14 from lane 0 of v5; x27 += rs_c.
+" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c15 from lane 1 of v5; x27 += rs_c.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
+" dup  v8.2d, xzr                            \n\t" // Zero v8-v13: when beta==0 the loads below are skipped and C is never read.
+" dup  v9.2d, xzr                            \n\t"
+" dup  v10.2d, xzr                           \n\t"
+" dup  v11.2d, xzr                           \n\t"
+" dup  v12.2d, xzr                           \n\t"
+" dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
-" st1 {v20.d}[0],[x2],x14                     \n\t" // Store c00  into quad and increment by rs_c.
-" st1 {v20.d}[1],[x2],x14                     \n\t" // Store c01  into quad and increment by rs_c.
-" st1 {v21.d}[0],[x2],x14                     \n\t" // Store c02  into quad and increment by rs_c.
-" st1 {v21.d}[1],[x2],x14                     \n\t" // Store c03  into quad and increment by rs_c.
+" fcmp d7,#0.0                               \n\t" // Compare beta (d7) against zero.
+" beq .DBETAZEROGENSTOREDS2                  \n\t" // beta==0: skip loading and scaling C.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" mov x27, x21                               \n\t" // x27 = running pointer into column 2 of C.
 "                                            \n\t"
-" st1 {v22.d}[0],[x2],x14                     \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v22.d}[1],[x2],x14                     \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v23.d}[0],[x2],x14                     \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v23.d}[1],[x2],x14                     \n\t" // Store c13  into quad and increment by rs_c.
+" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c20 into lane 0 of v8;  x27 += rs_c.
+" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c21 into lane 1 of v8;  x27 += rs_c.
+" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c22 into lane 0 of v9;  x27 += rs_c.
+" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c23 into lane 1 of v9;  x27 += rs_c.
+" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c24 into lane 0 of v10; x27 += rs_c.
+" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c25 into lane 1 of v10; x27 += rs_c.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" mov x27, x22                               \n\t" // x27 = running pointer into column 3 of C.
 "                                            \n\t"
-" st1 {v24.d}[0],[x2],x14                    \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v24.d}[1],[x2],x14                    \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v25.d}[0],[x2],x14                    \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v25.d}[1],[x2],x14                    \n\t" // Store c13  into quad and increment by rs_c.
+" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c30 into lane 0 of v11; x27 += rs_c.
+" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c31 into lane 1 of v11; x27 += rs_c.
+" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c32 into lane 0 of v12; x27 += rs_c.
+" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c33 into lane 1 of v12; x27 += rs_c.
+" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c34 into lane 0 of v13; x27 += rs_c.
+" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c35 into lane 1 of v13; x27 += rs_c.
 "                                            \n\t"
-" ldr x2,%[caddr]                            \n\t" // Load address of C.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
-" add x2,x2,x10                              \n\t" // c += cs_c.
+" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // v8  *= beta
+" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // v9  *= beta
+" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // v10 *= beta
+" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // v11 *= beta
+" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // v12 *= beta
+" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // v13 *= beta
 "                                            \n\t"
-" st1 {v26.d}[0],[x2],x14                    \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v26.d}[1],[x2],x14                    \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v27.d}[0],[x2],x14                    \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v27.d}[1],[x2],x14                    \n\t" // Store c13  into quad and increment by rs_c.
+" .DBETAZEROGENSTOREDS2:                     \n\t" // v8-v13 hold beta*C here, or zero when beta==0.
+"                                            \n\t"
+" fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // v8  += alpha * v14
+" fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // v9  += alpha * v15
+" fmla v10.2d,v16.2d,v6.d[0]                 \n\t" // v10 += alpha * v16
+" fmla v11.2d,v17.2d,v6.d[0]                 \n\t" // v11 += alpha * v17
+" fmla v12.2d,v18.2d,v6.d[0]                 \n\t" // v12 += alpha * v18
+" fmla v13.2d,v19.2d,v6.d[0]                 \n\t" // v13 += alpha * v19
+"                                            \n\t"
+" mov x27, x21                               \n\t" // Rewind x27 to the start of column 2 of C.
+"                                            \n\t"
+" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c20 from lane 0 of v8;  x27 += rs_c.
+" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c21 from lane 1 of v8;  x27 += rs_c.
+" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c22 from lane 0 of v9;  x27 += rs_c.
+" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c23 from lane 1 of v9;  x27 += rs_c.
+" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c24 from lane 0 of v10; x27 += rs_c.
+" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c25 from lane 1 of v10; x27 += rs_c.
+"                                            \n\t"
+" mov x27, x22                               \n\t" // Rewind x27 to the start of column 3 of C.
+"                                            \n\t"
+" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c30 from lane 0 of v11; x27 += rs_c.
+" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c31 from lane 1 of v11; x27 += rs_c.
+" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c32 from lane 0 of v12; x27 += rs_c.
+" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c33 from lane 1 of v12; x27 += rs_c.
+" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c34 from lane 0 of v13; x27 += rs_c.
+" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c35 from lane 1 of v13; x27 += rs_c.
+"                                            \n\t"
+" dup  v0.2d, xzr                            \n\t" // Zero v0-v5: when beta==0 the loads below are skipped and C is never read.
+" dup  v1.2d, xzr                            \n\t"
+" dup  v2.2d, xzr                            \n\t"
+" dup  v3.2d, xzr                            \n\t"
+" dup  v4.2d, xzr                            \n\t"
+" dup  v5.2d, xzr                            \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t" // Compare beta (d7) against zero.
+" beq .DBETAZEROGENSTOREDS3                  \n\t" // beta==0: skip loading and scaling C.
+"                                            \n\t"
+" mov x27, x23                               \n\t" // x27 = running pointer into column 4 of C.
+"                                            \n\t"
+" ld1 {v0.d}[0],[x27],x14                    \n\t" // Load c40 into lane 0 of v0; x27 += rs_c.
+" ld1 {v0.d}[1],[x27],x14                    \n\t" // Load c41 into lane 1 of v0; x27 += rs_c.
+" ld1 {v1.d}[0],[x27],x14                    \n\t" // Load c42 into lane 0 of v1; x27 += rs_c.
+" ld1 {v1.d}[1],[x27],x14                    \n\t" // Load c43 into lane 1 of v1; x27 += rs_c.
+" ld1 {v2.d}[0],[x27],x14                    \n\t" // Load c44 into lane 0 of v2; x27 += rs_c.
+" ld1 {v2.d}[1],[x27],x14                    \n\t" // Load c45 into lane 1 of v2; x27 += rs_c.
+"                                            \n\t"
+" mov x27, x24                               \n\t" // x27 = running pointer into column 5 of C.
+"                                            \n\t"
+" ld1 {v3.d}[0],[x27],x14                    \n\t" // Load c50 into lane 0 of v3; x27 += rs_c.
+" ld1 {v3.d}[1],[x27],x14                    \n\t" // Load c51 into lane 1 of v3; x27 += rs_c.
+" ld1 {v4.d}[0],[x27],x14                    \n\t" // Load c52 into lane 0 of v4; x27 += rs_c.
+" ld1 {v4.d}[1],[x27],x14                    \n\t" // Load c53 into lane 1 of v4; x27 += rs_c.
+" ld1 {v5.d}[0],[x27],x14                    \n\t" // Load c54 into lane 0 of v5; x27 += rs_c.
+" ld1 {v5.d}[1],[x27],x14                    \n\t" // Load c55 into lane 1 of v5; x27 += rs_c.
+"                                            \n\t"
+" fmul v0.2d,v0.2d,v7.d[0]                   \n\t" // v0 *= beta
+" fmul v1.2d,v1.2d,v7.d[0]                   \n\t" // v1 *= beta
+" fmul v2.2d,v2.2d,v7.d[0]                   \n\t" // v2 *= beta
+" fmul v3.2d,v3.2d,v7.d[0]                   \n\t" // v3 *= beta
+" fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // v4 *= beta
+" fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // v5 *= beta
+"                                            \n\t"
+" .DBETAZEROGENSTOREDS3:                     \n\t" // v0-v5 hold beta*C here, or zero when beta==0.
+"                                            \n\t"
+" fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // v0 += alpha * v20
+" fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // v1 += alpha * v21
+" fmla v2.2d,v22.2d,v6.d[0]                  \n\t" // v2 += alpha * v22
+" fmla v3.2d,v23.2d,v6.d[0]                  \n\t" // v3 += alpha * v23
+" fmla v4.2d,v24.2d,v6.d[0]                  \n\t" // v4 += alpha * v24
+" fmla v5.2d,v25.2d,v6.d[0]                  \n\t" // v5 += alpha * v25
+"                                            \n\t"
+" mov x27, x23                               \n\t" // Rewind x27 to the start of column 4 of C.
+"                                            \n\t"
+" st1 {v0.d}[0],[x27],x14                    \n\t" // Store c40 from lane 0 of v0; x27 += rs_c.
+" st1 {v0.d}[1],[x27],x14                    \n\t" // Store c41 from lane 1 of v0; x27 += rs_c.
+" st1 {v1.d}[0],[x27],x14                    \n\t" // Store c42 from lane 0 of v1; x27 += rs_c.
+" st1 {v1.d}[1],[x27],x14                    \n\t" // Store c43 from lane 1 of v1; x27 += rs_c.
+" st1 {v2.d}[0],[x27],x14                    \n\t" // Store c44 from lane 0 of v2; x27 += rs_c.
+" st1 {v2.d}[1],[x27],x14                    \n\t" // Store c45 from lane 1 of v2; x27 += rs_c.
+"                                            \n\t"
+" mov x27, x24                               \n\t" // Rewind x27 to the start of column 5 of C.
+"                                            \n\t"
+" st1 {v3.d}[0],[x27],x14                    \n\t" // Store c50 from lane 0 of v3; x27 += rs_c.
+" st1 {v3.d}[1],[x27],x14                    \n\t" // Store c51 from lane 1 of v3; x27 += rs_c.
+" st1 {v4.d}[0],[x27],x14                    \n\t" // Store c52 from lane 0 of v4; x27 += rs_c.
+" st1 {v4.d}[1],[x27],x14                    \n\t" // Store c53 from lane 1 of v4; x27 += rs_c.
+" st1 {v5.d}[0],[x27],x14                    \n\t" // Store c54 from lane 0 of v5; x27 += rs_c.
+" st1 {v5.d}[1],[x27],x14                    \n\t" // Store c55 from lane 1 of v5; x27 += rs_c.
+"                                            \n\t"
+" dup  v8.2d, xzr                            \n\t" // Zero v8-v13: when beta==0 the loads below are skipped and C is never read.
+" dup  v9.2d, xzr                            \n\t"
+" dup  v10.2d, xzr                           \n\t"
+" dup  v11.2d, xzr                           \n\t"
+" dup  v12.2d, xzr                           \n\t"
+" dup  v13.2d, xzr                           \n\t"
+"                                            \n\t"
+" fcmp d7,#0.0                               \n\t" // Compare beta (d7) against zero.
+" beq .DBETAZEROGENSTOREDS4                  \n\t" // beta==0: skip loading and scaling C.
+"                                            \n\t"
+" mov x27, x25                               \n\t" // x27 = running pointer into column 6 of C.
+"                                            \n\t"
+" ld1 {v8.d}[0], [x27],x14                   \n\t" // Load c60 into lane 0 of v8;  x27 += rs_c.
+" ld1 {v8.d}[1], [x27],x14                   \n\t" // Load c61 into lane 1 of v8;  x27 += rs_c.
+" ld1 {v9.d}[0], [x27],x14                   \n\t" // Load c62 into lane 0 of v9;  x27 += rs_c.
+" ld1 {v9.d}[1], [x27],x14                   \n\t" // Load c63 into lane 1 of v9;  x27 += rs_c.
+" ld1 {v10.d}[0],[x27],x14                   \n\t" // Load c64 into lane 0 of v10; x27 += rs_c.
+" ld1 {v10.d}[1],[x27],x14                   \n\t" // Load c65 into lane 1 of v10; x27 += rs_c.
+"                                            \n\t"
+" mov x27, x26                               \n\t" // x27 = running pointer into column 7 of C.
+"                                            \n\t"
+" ld1 {v11.d}[0],[x27],x14                   \n\t" // Load c70 into lane 0 of v11; x27 += rs_c.
+" ld1 {v11.d}[1],[x27],x14                   \n\t" // Load c71 into lane 1 of v11; x27 += rs_c.
+" ld1 {v12.d}[0],[x27],x14                   \n\t" // Load c72 into lane 0 of v12; x27 += rs_c.
+" ld1 {v12.d}[1],[x27],x14                   \n\t" // Load c73 into lane 1 of v12; x27 += rs_c.
+" ld1 {v13.d}[0],[x27],x14                   \n\t" // Load c74 into lane 0 of v13; x27 += rs_c.
+" ld1 {v13.d}[1],[x27],x14                   \n\t" // Load c75 into lane 1 of v13; x27 += rs_c.
+"                                            \n\t"
+" fmul v8.2d, v8.2d, v7.d[0]                 \n\t" // v8  *= beta
+" fmul v9.2d, v9.2d, v7.d[0]                 \n\t" // v9  *= beta
+" fmul v10.2d,v10.2d,v7.d[0]                 \n\t" // v10 *= beta
+" fmul v11.2d,v11.2d,v7.d[0]                 \n\t" // v11 *= beta
+" fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // v12 *= beta
+" fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // v13 *= beta
+"                                            \n\t"
+" .DBETAZEROGENSTOREDS4:                     \n\t" // v8-v13 hold beta*C here, or zero when beta==0.
+"                                            \n\t"
+" prfm pldl2keep,[x3]                        \n\t" // Prefetch [x3] for load into L2. NOTE(review): x3 presumably holds a_next - confirm.
+" prfm pldl2keep,[x4]                        \n\t" // Prefetch [x4] for load into L2. NOTE(review): x4 presumably holds b_next - confirm.
+"                                            \n\t"
+" fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // v8  += alpha * v26
+" fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // v9  += alpha * v27
+" fmla v10.2d,v28.2d,v6.d[0]                 \n\t" // v10 += alpha * v28
+" fmla v11.2d,v29.2d,v6.d[0]                 \n\t" // v11 += alpha * v29
+" fmla v12.2d,v30.2d,v6.d[0]                 \n\t" // v12 += alpha * v30
+" fmla v13.2d,v31.2d,v6.d[0]                 \n\t" // v13 += alpha * v31
+"                                            \n\t"
+" mov x27, x25                               \n\t" // Rewind x27 to the start of column 6 of C.
+"                                            \n\t"
+" st1 {v8.d}[0], [x27],x14                   \n\t" // Store c60 from lane 0 of v8;  x27 += rs_c.
+" st1 {v8.d}[1], [x27],x14                   \n\t" // Store c61 from lane 1 of v8;  x27 += rs_c.
+" st1 {v9.d}[0], [x27],x14                   \n\t" // Store c62 from lane 0 of v9;  x27 += rs_c.
+" st1 {v9.d}[1], [x27],x14                   \n\t" // Store c63 from lane 1 of v9;  x27 += rs_c.
+" st1 {v10.d}[0],[x27],x14                   \n\t" // Store c64 from lane 0 of v10; x27 += rs_c.
+" st1 {v10.d}[1],[x27],x14                   \n\t" // Store c65 from lane 1 of v10; x27 += rs_c.
+"                                            \n\t"
+" mov x27, x26                               \n\t" // Rewind x27 to the start of column 7 of C.
+"                                            \n\t"
+" st1 {v11.d}[0],[x27],x14                   \n\t" // Store c70 from lane 0 of v11; x27 += rs_c.
+" st1 {v11.d}[1],[x27],x14                   \n\t" // Store c71 from lane 1 of v11; x27 += rs_c.
+" st1 {v12.d}[0],[x27],x14                   \n\t" // Store c72 from lane 0 of v12; x27 += rs_c.
+" st1 {v12.d}[1],[x27],x14                   \n\t" // Store c73 from lane 1 of v12; x27 += rs_c.
+" st1 {v13.d}[0],[x27],x14                   \n\t" // Store c74 from lane 0 of v13; x27 += rs_c.
+" st1 {v13.d}[1],[x27],x14                   \n\t" // Store c75 from lane 1 of v13; x27 += rs_c.
 "                                            \n\t"
 " .DEND:                                     \n\t" // Done!
 "                                            \n\t"
@@ -784,10 +2047,12 @@ __asm__ volatile
  [a_next] "m" (a_next), // 8
  [b_next] "m" (b_next)  // 9
 :// Register clobber list
- "x0","x1","x2",
+ "x0","x1","x2","x3",
  "x4","x5","x6",
  "x7","x8","x9",
  "x10","x11","x12","x13","x14","x16","x17",
+ "x20","x21","x22","x23","x24","x25","x26",
+ "x27",       
  "v0","v1","v2",
  "v3","v4","v5",
  "v6","v7","v8",
@@ -796,7 +2061,7 @@ __asm__ volatile
  "v15","v16","v17","v18","v19",
  "v20","v21","v22","v23",
  "v24","v25","v26","v27",
- "v30","v31"
+ "v28","v29","v30","v31"
 );
 
 

From 2bd036f1f9ce1ee0864365557f66d9415dd42de3 Mon Sep 17 00:00:00 2001
From: Devin Matthews <dmatthews@utexas.edu>
Date: Fri, 25 Mar 2016 12:16:49 -0500
Subject: [PATCH 04/10] Fix configuration issue where instruction set flags are
 not specified for debug builds.

---
 config/armv7a/make_defs.mk      | 23 +++++++++++++++++++----
 config/armv8a/make_defs.mk      | 24 +++++++++++++++++++-----
 config/bulldozer/make_defs.mk   | 21 ++++++++++++++++++---
 config/carrizo/make_defs.mk     | 21 ++++++++++++++++++---
 config/cortex-a15/make_defs.mk  |  7 ++-----
 config/cortex-a9/make_defs.mk   |  7 ++-----
 config/dunnington/make_defs.mk  |  7 ++-----
 config/haswell/make_defs.mk     |  7 ++-----
 config/loongson3a/make_defs.mk  |  7 ++-----
 config/mic/make_defs.mk         |  3 ---
 config/piledriver/make_defs.mk  |  7 ++-----
 config/power7/make_defs.mk      |  3 ---
 config/reference/make_defs.mk   |  3 ---
 config/sandybridge/make_defs.mk |  7 ++-----
 config/template/make_defs.mk    |  3 ---
 configure                       |  3 ++-
 16 files changed, 90 insertions(+), 63 deletions(-)

diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk
index a28ad0878..ff1d84b0d 100644
--- a/config/armv7a/make_defs.mk
+++ b/config/armv7a/make_defs.mk
@@ -76,17 +76,32 @@ GIT_LOG    := $(GIT) log --decorate
 #
 
 # --- Determine the C compiler and related flags ---
+ifeq ($(CC),)
 CC             := gcc
+CC_VENDOR      := gcc
+endif
+ifneq ($(CC_VENDOR),gcc)
+$(error gcc is required for this configuration.)
+endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
+CMISCFLAGS     := -std=c99 -mfloat-abi=hard
 CPICFLAGS      := -fPIC
-CDBGFLAGS      := #-g
 CWARNFLAGS     := -Wall
-COPTFLAGS      := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O3
+endif
+
+CVECFLAGS      := -mfpu=vfpv3 -marm -march=armv7-a
 CKOPTFLAGS     := $(COPTFLAGS)
-CVECFLAGS      := #-msse3  # -mfpmath=sse
 
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk
index 63c03c6a0..3cec43c92 100644
--- a/config/armv8a/make_defs.mk
+++ b/config/armv8a/make_defs.mk
@@ -76,18 +76,32 @@ GIT_LOG    := $(GIT) log --decorate
 #
 
 # --- Determine the C compiler and related flags ---
+ifeq ($(CC),)
 CC             := gcc
-
+CC_VENDOR      := gcc
+endif
+ifneq ($(CC_VENDOR),gcc)
+$(error gcc is required for this configuration.)
+endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_GNU_SOURCE
-CMISCFLAGS     := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -fopenmp -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
+CMISCFLAGS     := -std=c99 -fopenmp
 CPICFLAGS      := -fPIC
-CDBGFLAGS      := -g #-g3 -gdwarf-2 
 CWARNFLAGS     := -Wall
-COPTFLAGS      := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O3 -ftree-vectorize -mtune=cortex-a57.cortex-a53
+endif
+
+CVECFLAGS      := -march=armv8-a+fp+simd -mcpu=cortex-a57.cortex-a53
 CKOPTFLAGS     := $(COPTFLAGS)
-CVECFLAGS      := #-march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8 
 
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk
index 13e306a02..772ac1c53 100644
--- a/config/bulldozer/make_defs.mk
+++ b/config/bulldozer/make_defs.mk
@@ -76,17 +76,32 @@ GIT_LOG    := $(GIT) log --decorate
 #
 
 # --- Determine the C compiler and related flags ---
+ifeq ($(CC),)
 CC             := gcc
+CC_VENDOR      := gcc
+endif
+ifneq ($(CC_VENDOR),gcc)
+$(error gcc is required for this configuration.)
+endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 -fopenmp
 CPICFLAGS      := -fPIC
-CDBGFLAGS      := -g
 CWARNFLAGS     := -Wall
-COPTFLAGS      := -O0 -malign-double -funroll-all-loops
-CKOPTFLAGS     := $(COPTFLAGS)
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2 -malign-double -funroll-all-loops
+endif
+
 CVECFLAGS      := -mavx -mfma -march=bdver2 -mfpmath=sse
+CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk
index 5f5303ade..08a5baaaf 100644
--- a/config/carrizo/make_defs.mk
+++ b/config/carrizo/make_defs.mk
@@ -76,17 +76,32 @@ GIT_LOG    := $(GIT) log --decorate
 #
 
 # --- Determine the C compiler and related flags ---
+ifeq ($(CC),)
 CC             := gcc
+CC_VENDOR      := gcc
+endif
+ifneq ($(CC_VENDOR),gcc)
+$(error gcc is required for this configuration.)
+endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS     := -std=c99 -fopenmp
 CPICFLAGS      := -fPIC
-CDBGFLAGS      := #-g
 CWARNFLAGS     := -Wall
-COPTFLAGS      := -O2 -mfpmath=sse -fomit-frame-pointer
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS      := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS      := -O0
+else
+COPTFLAGS      := -O2 -fomit-frame-pointer
+endif
+
+CVECFLAGS      := -mavx -mfma -march=native -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
-CVECFLAGS      := -mavx -mfma -march=native
 
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk
index 078c063b7..e81c28f60 100644
--- a/config/cortex-a15/make_defs.mk
+++ b/config/cortex-a15/make_defs.mk
@@ -97,13 +97,10 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS      := -march=armv7-a -mfpu=neon -O2
-endif
-
-ifneq ($(DEBUG_TYPE),noopt)
-CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
+COPTFLAGS      := -O2
 endif
 
+CVECFLAGS      := -march=armv7-a #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk
index 7dbc0aa77..e81c28f60 100644
--- a/config/cortex-a9/make_defs.mk
+++ b/config/cortex-a9/make_defs.mk
@@ -97,13 +97,10 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS      := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard
-endif
-
-ifneq ($(DEBUG_TYPE),noopt)
-CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
+COPTFLAGS      := -O2
 endif
 
+CVECFLAGS      := -march=armv7-a #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk
index d065640f5..4d06567d0 100644
--- a/config/dunnington/make_defs.mk
+++ b/config/dunnington/make_defs.mk
@@ -97,13 +97,10 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS     := -O2 -mfpmath=sse -fomit-frame-pointer
-endif
-
-ifneq ($(DEBUG_TYPE),noopt)
-CVECFLAGS      := -msse3 -march=native
+COPTFLAGS      := -O2 -fomit-frame-pointer
 endif
 
+CVECFLAGS      := -msse3 -march=native -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk
index a1865a98b..895746fc5 100644
--- a/config/haswell/make_defs.mk
+++ b/config/haswell/make_defs.mk
@@ -97,13 +97,10 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS      := -O3 -march=native
-endif
-
-ifneq ($(DEBUG_TYPE),noopt)
-CVECFLAGS      := -mavx2 -mfma -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
+COPTFLAGS      := -O3
 endif
 
+CVECFLAGS      := -mavx2 -mfma -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk
index d3500a7c3..8296dcd92 100644
--- a/config/loongson3a/make_defs.mk
+++ b/config/loongson3a/make_defs.mk
@@ -97,13 +97,10 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS      := -O3  -march=loongson3a -mtune=loongson3a
-endif
-
-ifneq ($(DEBUG_TYPE),noopt)
-CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
+COPTFLAGS      := -O3 -mtune=loongson3a
 endif
 
+CVECFLAGS      := -march=loongson3a #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk
index 93e1efb9a..5e298269d 100644
--- a/config/mic/make_defs.mk
+++ b/config/mic/make_defs.mk
@@ -100,10 +100,7 @@ else
 COPTFLAGS      := -O3
 endif
 
-ifneq ($(DEBUG_TYPE),noopt)
 CVECFLAGS      := 
-endif
-
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk
index 89756bea1..08a5baaaf 100644
--- a/config/piledriver/make_defs.mk
+++ b/config/piledriver/make_defs.mk
@@ -97,13 +97,10 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS      := -O2 -mfpmath=sse -fomit-frame-pointer
-endif
-
-ifneq ($(DEBUG_TYPE),noopt)
-CVECFLAGS      := -mavx -mfma -march=native
+COPTFLAGS      := -O2 -fomit-frame-pointer
 endif
 
+CVECFLAGS      := -mavx -mfma -march=native -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk
index 460f53d07..8beaa15fb 100644
--- a/config/power7/make_defs.mk
+++ b/config/power7/make_defs.mk
@@ -100,10 +100,7 @@ else
 COPTFLAGS      := -O3 -mtune=power7
 endif
 
-ifneq ($(DEBUG_TYPE),noopt)
 CVECFLAGS      := -mvsx
-endif
-
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk
index 509c4d9e5..b0ac0c62f 100644
--- a/config/reference/make_defs.mk
+++ b/config/reference/make_defs.mk
@@ -100,10 +100,7 @@ else
 COPTFLAGS      := -O2
 endif
 
-ifneq ($(DEBUG_TYPE),noopt)
 CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
-endif
-
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk
index c1fd57176..4b96d93f7 100644
--- a/config/sandybridge/make_defs.mk
+++ b/config/sandybridge/make_defs.mk
@@ -97,13 +97,10 @@ endif
 ifeq ($(DEBUG_TYPE),noopt)
 COPTFLAGS      := -O0
 else
-COPTFLAGS      := -O3 -march=native
-endif
-
-ifneq ($(DEBUG_TYPE),noopt)
-CVECFLAGS      := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
+COPTFLAGS      := -O3
 endif
 
+CVECFLAGS      := -mavx -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk
index 509c4d9e5..b0ac0c62f 100644
--- a/config/template/make_defs.mk
+++ b/config/template/make_defs.mk
@@ -100,10 +100,7 @@ else
 COPTFLAGS      := -O2
 endif
 
-ifneq ($(DEBUG_TYPE),noopt)
 CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
-endif
-
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # Aggregate all of the flags into multiple groups: one for standard
diff --git a/configure b/configure
index 4528a826e..63303434e 100755
--- a/configure
+++ b/configure
@@ -206,6 +206,7 @@ main()
 						;;
 					enable-debug)
 						debug_flag=1
+						debug_type=noopt
 						;;
 					enable-debug=*)
 						debug_flag=1
@@ -327,7 +328,7 @@ main()
 
 	# Check if the debug flag was specified.
 	if [ -n "${debug_flag}" ]; then
-		if [ ${debug_type} = 'opt' ]; then
+		if [ "x${debug_type}" = "xopt" ]; then
 			echo "${script_name}: enabling debug symbols with optimizations."
 		else
 			debug_type='noopt'

From 9452bdb3afbf2d7f898134a091d7790817e7be9c Mon Sep 17 00:00:00 2001
From: Devin Matthews <dmatthews@utexas.edu>
Date: Fri, 25 Mar 2016 14:59:50 -0500
Subject: [PATCH 05/10] Add options for verbose make output and static/shared
 linking to configure.

---
 build/config.mk.in |  4 ++++
 configure          | 43 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/build/config.mk.in b/build/config.mk.in
index db63e517f..2af506740 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -57,6 +57,10 @@ CC_VENDOR      := @cc_vendor@
 # may install to a temporary location.
 INSTALL_PREFIX := $(DESTDIR)@install_prefix@
 
+# Variables corresponding to other configure-time options.
+BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := @enable_verbose@
+BLIS_ENABLE_STATIC_BUILD        := @enable_static@
+BLIS_ENABLE_DYNAMIC_BUILD       := @enable_shared@
 
 # end of ifndef CONFIG_MK_INCLUDED conditional block
 endif
diff --git a/configure b/configure
index 4528a826e..95466771e 100755
--- a/configure
+++ b/configure
@@ -73,6 +73,21 @@ print_usage()
 	echo "                 kept in the framework, otherwise optimization is"
 	echo "                 turned off."
 	echo " "
+	echo "   --enable-verbose-make, --disable-verbose-make"
+	echo " "
+	echo "                 Enable (disabled by default) verbose compilation"
+	echo "                 output during make."
+	echo " "
+	echo "   --disable-static, --enable-static"
+	echo " "
+	echo "                 Disable (enabled by default) building BLIS as a static"
+	echo "                 library. May be combined with --enable-shared."
+	echo " "
+	echo "   --enable-shared, --disable-shared"
+	echo " "
+	echo "                 Enable (disabled by default) building BLIS as a shared"
+	echo "                 library. May be combined with --enable-static."
+	echo " "
 	echo "   -q, --quiet   Suppress informational output. By default, configure"
 	echo "                 is verbose. (NOTE: -q is not yet implemented)"
 	echo " "
@@ -85,7 +100,7 @@ print_usage()
 	echo "   Environment variables may also be specified as command line"
 	echo "   options, e.g.:"
 	echo " "
-	echo "     ./configure CC=gcc sandybridge"
+	echo "     ./configure [options] CC=gcc sandybridge"
 	echo " "
 	echo "   Note that not all compilers are compatible with a given"
 	echo "   configuration."
@@ -166,6 +181,11 @@ main()
 
 	# Option variables.
 	quiet_flag=''
+	
+	# Additional flags; defaults match the help text in print_usage().
+	enable_verbose='no'
+	enable_static='yes'
+	enable_shared='no'
 
 	# The path to the auto-detection script.
 	auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh"
@@ -211,6 +231,24 @@ main()
 						debug_flag=1
 						debug_type=${OPTARG#*=}
 						;;
+					enable-verbose-make)
+						enable_verbose='yes'
+						;;
+					disable-verbose-make)
+						enable_verbose='no'
+						;;
+					enable-static)
+						enable_static='yes'
+						;;
+					disable-static)
+						enable_static='no'
+						;;
+					enable-shared)
+						enable_shared='yes'
+						;;
+					disable-shared)
+						enable_shared='no'
+						;;
 					*)
 						print_usage
 						;;
@@ -375,6 +413,9 @@ main()
 		| sed "s/@cc_vendor@/${cc_vendor}/g" \
 		| sed "s/@debug_type@/${debug_type}/g" \
 		| sed "s/@install_prefix@/${install_prefix_esc}/g" \
+		| sed "s/@enable_verbose@/${enable_verbose}/g" \
+		| sed "s/@enable_static@/${enable_static}/g" \
+		| sed "s/@enable_shared@/${enable_shared}/g" \
 		> "${config_mk_out_path}"
 
 

From 76099f20be1b49ac960f7e3c5a8296bbf4e1782d Mon Sep 17 00:00:00 2001
From: Devin Matthews <dmatthews@utexas.edu>
Date: Fri, 25 Mar 2016 17:22:58 -0500
Subject: [PATCH 06/10] Add threading option to configure.

---
 Makefile                        | 71 +++++++++++++++++++++++++++++++
 build/config.mk.in              | 17 +++++---
 config/armv7a/make_defs.mk      | 40 ------------------
 config/armv8a/make_defs.mk      | 44 +------------------
 config/bgq/make_defs.mk         | 41 +-----------------
 config/bulldozer/make_defs.mk   | 42 +-----------------
 config/carrizo/bli_config.h     |  3 --
 config/carrizo/make_defs.mk     | 44 +------------------
 config/cortex-a15/make_defs.mk  | 40 ------------------
 config/cortex-a9/make_defs.mk   | 40 ------------------
 config/dunnington/make_defs.mk  | 42 +-----------------
 config/emscripten/make_defs.mk  | 41 +-----------------
 config/haswell/bli_config.h     |  5 ---
 config/haswell/make_defs.mk     | 44 +------------------
 config/loongson3a/make_defs.mk  | 42 +-----------------
 config/mic/bli_config.h         |  1 -
 config/mic/make_defs.mk         | 44 +------------------
 config/piledriver/bli_config.h  |  3 --
 config/piledriver/make_defs.mk  | 44 +------------------
 config/pnacl/make_defs.mk       | 41 +-----------------
 config/power7/make_defs.mk      | 42 +-----------------
 config/reference/make_defs.mk   | 42 +-----------------
 config/sandybridge/bli_config.h |  6 ---
 config/sandybridge/make_defs.mk | 44 +------------------
 config/template/make_defs.mk    | 42 +-----------------
 configure                       | 75 +++++++++++++++++++++++++++++++--
 26 files changed, 173 insertions(+), 767 deletions(-)

diff --git a/Makefile b/Makefile
index c27ecd8f0..2b11f40a9 100644
--- a/Makefile
+++ b/Makefile
@@ -138,6 +138,28 @@ BASE_LIB_PATH           := ./$(LIB_DIR)/$(CONFIG_NAME)
 
 
 
+#
+# --- Utility program definitions ----------------------------------------------
+#
+
+SH         := /bin/sh
+MV         := mv
+MKDIR      := mkdir -p
+RM_F       := rm -f
+RM_RF      := rm -rf
+SYMLINK    := ln -sf
+FIND       := find
+GREP       := grep
+XARGS      := xargs
+RANLIB     := ranlib
+INSTALL    := install -c
+
+# Used to refresh CHANGELOG.
+GIT        := git
+GIT_LOG    := $(GIT) log --decorate
+
+
+
 #
 # --- Include makefile definitions file ----------------------------------------
 #
@@ -157,6 +179,55 @@ else
 MAKE_DEFS_MK_PRESENT := no
 endif
 
+# Deal with threading flags and aggregate all of the flags into multiple groups:
+# one for standard compilation, and one for each of the supported "special"
+# compilation modes.
+
+ifeq ($(CC_VENDOR),gcc)
+ifeq ($(THREADING_MODEL),auto)
+THREADING_MODEL := omp
+endif
+ifeq ($(THREADING_MODEL),omp)
+CTHREADFLAGS := -fopenmp -DBLIS_ENABLE_OPENMP
+LDFLAGS      += -fopenmp
+endif
+ifeq ($(THREADING_MODEL),pthreads)
+CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
+LDFLAGS      += -pthread
+endif
+endif
+
+ifeq ($(CC_VENDOR),icc)
+ifeq ($(THREADING_MODEL),auto)
+THREADING_MODEL := omp
+endif
+ifeq ($(THREADING_MODEL),omp)
+CTHREADFLAGS := -openmp -DBLIS_ENABLE_OPENMP
+LDFLAGS      += -openmp
+endif
+ifeq ($(THREADING_MODEL),pthreads)
+CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
+LDFLAGS      += -pthread
+endif
+endif
+
+ifeq ($(CC_VENDOR),clang)
+ifeq ($(THREADING_MODEL),auto)
+THREADING_MODEL := pthreads
+endif
+ifeq ($(THREADING_MODEL),omp)
+$(error OpenMP is not supported with Clang.)
+endif
+ifeq ($(THREADING_MODEL),pthreads)
+CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
+LDFLAGS      += -pthread
+endif
+endif
+
+CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CTHREADFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
+CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
+CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
+
 
 
 #
diff --git a/build/config.mk.in b/build/config.mk.in
index 2af506740..a043d7aa9 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -37,20 +37,23 @@ ifndef CONFIG_MK_INCLUDED
 CONFIG_MK_INCLUDED := yes
 
 # The name of the configuration sub-directory.
-CONFIG_NAME    := @config_name@
+CONFIG_NAME     := @config_name@
 
-# The operating system name, which should be either 'Linux' or 'Darwin'.
-OS_NAME        := $(shell uname -s)
+# The operating system name, which should be either 'Linux' or 'Darwin'.
+OS_NAME         := $(shell uname -s)
 
 # The directory path to the top level of the source distribution.
-DIST_PATH      := @dist_path@
+DIST_PATH       := @dist_path@
 
 # The level of debugging info to generate.
-DEBUG_TYPE     := @debug_type@
+DEBUG_TYPE      := @debug_type@
 
 # The C compiler.
-CC             := @CC@
-CC_VENDOR      := @cc_vendor@
+CC              := @CC@
+CC_VENDOR       := @cc_vendor@
+
+# The requested threading model.
+THREADING_MODEL := @threading_model@
 
 # The install prefix tell us where to install the libraries and header file
 # directory. Notice that we support the use of DESTDIR so that advanced users
diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk
index ff1d84b0d..40b6c179a 100644
--- a/config/armv7a/make_defs.mk
+++ b/config/armv7a/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -mfpu=vfpv3 -marm -march=armv7-a
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk
index 3cec43c92..654a9ff92 100644
--- a/config/armv8a/make_defs.mk
+++ b/config/armv8a/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_GNU_SOURCE
-CMISCFLAGS     := -std=c99 -fopenmp
+CMISCFLAGS     := -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -march=armv8-a+fp+simd -mcpu=cortex-a57.cortex-a53
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
@@ -117,7 +77,7 @@ ARFLAGS        := cru
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
 SOFLAGS        := -shared
-LDFLAGS        := -lm -fopenmp
+LDFLAGS        := -lm
 
 
 
diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk
index 050f353fb..0f405102b 100644
--- a/config/bgq/make_defs.mk
+++ b/config/bgq/make_defs.mk
@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
 
 # --- Determine the C compiler and related flags ---
 CC             := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r
+CC_VENDOR      := IBM
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L \
@@ -89,13 +57,6 @@ COPTFLAGS      := -O3
 CKOPTFLAGS     := $(COPTFLAGS)
 CVECFLAGS      := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunroll=yes -qnoipa
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk
index 772ac1c53..78f47d908 100644
--- a/config/bulldozer/make_defs.mk
+++ b/config/bulldozer/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 -fopenmp
+CMISCFLAGS     := -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -mavx -mfma -march=bdver2 -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/carrizo/bli_config.h b/config/carrizo/bli_config.h
index b0da5de46..86a584112 100644
--- a/config/carrizo/bli_config.h
+++ b/config/carrizo/bli_config.h
@@ -36,9 +36,6 @@
 #define BLIS_CONFIG_H
 
 
-//#define BLIS_ENABLE_PTHREADS
-#define BLIS_ENABLE_OPENMP
-
 #define BLIS_SIMD_ALIGN_SIZE             16
 
 
diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk
index 08a5baaaf..4708a8f36 100644
--- a/config/carrizo/make_defs.mk
+++ b/config/carrizo/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 -fopenmp
+CMISCFLAGS     := -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -mavx -mfma -march=native -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
@@ -117,7 +77,7 @@ ARFLAGS        := cru
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
 SOFLAGS        := -shared
-LDFLAGS        := -lm -fopenmp
+LDFLAGS        := -lm
 
 
 
diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk
index e81c28f60..6f584f14c 100644
--- a/config/cortex-a15/make_defs.mk
+++ b/config/cortex-a15/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -march=armv7-a #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk
index e81c28f60..6f584f14c 100644
--- a/config/cortex-a9/make_defs.mk
+++ b/config/cortex-a9/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -march=armv7-a #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk
index 4d06567d0..e67d45e85 100644
--- a/config/dunnington/make_defs.mk
+++ b/config/dunnington/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 # -fopenmp -pg
+CMISCFLAGS     := -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -msse3 -march=native -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk
index 55107d98a..45b210ab6 100644
--- a/config/emscripten/make_defs.mk
+++ b/config/emscripten/make_defs.mk
@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := emranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
 
 # --- Determine the C compiler and related flags ---
 CC             := emcc
+CC_VENDOR      := emcc
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
@@ -88,13 +56,6 @@ COPTFLAGS      := -O2
 CKOPTFLAGS     := -O3
 CVECFLAGS      :=
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := emar
 ARFLAGS        := cru
diff --git a/config/haswell/bli_config.h b/config/haswell/bli_config.h
index 5f66f6dae..89bba2b20 100644
--- a/config/haswell/bli_config.h
+++ b/config/haswell/bli_config.h
@@ -35,11 +35,6 @@
 #ifndef BLIS_CONFIG_H
 #define BLIS_CONFIG_H
 
-// Enable multithreading via POSIX threads.
-//#define BLIS_ENABLE_PTHREADS
-
-// Enable multithreading via OpenMP.
-#define BLIS_ENABLE_OPENMP
 
 
 
diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk
index 895746fc5..cbc11f37a 100644
--- a/config/haswell/make_defs.mk
+++ b/config/haswell/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 -m64 -fopenmp  # -fopenmp -pg
+CMISCFLAGS     := -std=c99 -m64
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -mavx2 -mfma -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
@@ -117,7 +77,7 @@ ARFLAGS        := cru
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
 SOFLAGS        := -shared
-LDFLAGS        := -lm -fopenmp -lpthread
+LDFLAGS        := -lm
 
 
 
diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk
index 8296dcd92..8bb13192c 100644
--- a/config/loongson3a/make_defs.mk
+++ b/config/loongson3a/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L -mabi=64
-CMISCFLAGS     := -std=c99 -fopenmp #-pg
+CMISCFLAGS     := -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -march=loongson3a #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/mic/bli_config.h b/config/mic/bli_config.h
index a119a2dde..36b14cf4c 100644
--- a/config/mic/bli_config.h
+++ b/config/mic/bli_config.h
@@ -39,7 +39,6 @@
 #define BLIS_TREE_BARRIER
 #define BLIS_TREE_BARRIER_ARITY 4
 
-#define BLIS_ENABLE_OPENMP
 
 #define BLIS_SIMD_ALIGN_SIZE             32
 
diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk
index 5e298269d..21af9e2e2 100644
--- a/config/mic/make_defs.mk
+++ b/config/mic/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -mmic -fasm-blocks -std=c99 -openmp
+CMISCFLAGS     := -mmic -fasm-blocks -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := 
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
@@ -117,7 +77,7 @@ ARFLAGS        := cru
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
 SOFLAGS        := -shared
-LDFLAGS        := -mmic -lm -openmp
+LDFLAGS        := -mmic -lm
 
 
 
diff --git a/config/piledriver/bli_config.h b/config/piledriver/bli_config.h
index dce91516d..38708a0b2 100644
--- a/config/piledriver/bli_config.h
+++ b/config/piledriver/bli_config.h
@@ -36,9 +36,6 @@
 #define BLIS_CONFIG_H
 
 
-//#define BLIS_ENABLE_PTHREADS
-
-#define BLIS_ENABLE_OPENMP
 
 #define BLIS_SIMD_ALIGN_SIZE 16
 
diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk
index 08a5baaaf..4708a8f36 100644
--- a/config/piledriver/make_defs.mk
+++ b/config/piledriver/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 -fopenmp
+CMISCFLAGS     := -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -mavx -mfma -march=native -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
@@ -117,7 +77,7 @@ ARFLAGS        := cru
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
 SOFLAGS        := -shared
-LDFLAGS        := -lm -fopenmp
+LDFLAGS        := -lm
 
 
 
diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk
index e1fa986be..e957cf429 100644
--- a/config/pnacl/make_defs.mk
+++ b/config/pnacl/make_defs.mk
@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := pnacl-ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
 
 # --- Determine the C compiler and related flags ---
 CC             := pnacl-clang
+CC_VENDOR      := pnacl-clang
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
@@ -88,13 +56,6 @@ COPTFLAGS      := -O3
 CKOPTFLAGS     := $(COPTFLAGS) -ffast-math
 CVECFLAGS      :=
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := pnacl-ar
 ARFLAGS        := rcs
diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk
index 8beaa15fb..d03857a44 100644
--- a/config/power7/make_defs.mk
+++ b/config/power7/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 -m64 -mcpu=power7 #-fopenmp -pg
+CMISCFLAGS     := -std=c99 -m64 -mcpu=power7
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -mvsx
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk
index b0ac0c62f..b17e3a0ba 100644
--- a/config/reference/make_defs.mk
+++ b/config/reference/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 # -fopenmp -pg
+CMISCFLAGS     := -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/sandybridge/bli_config.h b/config/sandybridge/bli_config.h
index 5f66f6dae..5b915c737 100644
--- a/config/sandybridge/bli_config.h
+++ b/config/sandybridge/bli_config.h
@@ -35,12 +35,6 @@
 #ifndef BLIS_CONFIG_H
 #define BLIS_CONFIG_H
 
-// Enable multithreading via POSIX threads.
-//#define BLIS_ENABLE_PTHREADS
-
-// Enable multithreading via OpenMP.
-#define BLIS_ENABLE_OPENMP
-
 
 
 #endif
diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk
index 4b96d93f7..c69387c7b 100644
--- a/config/sandybridge/make_defs.mk
+++ b/config/sandybridge/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 -m64 -fopenmp  # -fopenmp -pg
+CMISCFLAGS     := -std=c99 -m64
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := -mavx -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
@@ -117,7 +77,7 @@ ARFLAGS        := cru
 # --- Determine the linker and related flags ---
 LINKER         := $(CC)
 SOFLAGS        := -shared
-LDFLAGS        := -lm -fopenmp -lpthread
+LDFLAGS        := -lm
 
 
 
diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk
index b0ac0c62f..b17e3a0ba 100644
--- a/config/template/make_defs.mk
+++ b/config/template/make_defs.mk
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
 
 
 
-#
-# --- Build definitions --------------------------------------------------------
-#
-
-# Variables corresponding to other configure-time options.
-BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
-BLIS_ENABLE_STATIC_BUILD        := yes
-BLIS_ENABLE_DYNAMIC_BUILD       := no
-
-
-
-#
-# --- Utility program definitions ----------------------------------------------
-#
-
-SH         := /bin/sh
-MV         := mv
-MKDIR      := mkdir -p
-RM_F       := rm -f
-RM_RF      := rm -rf
-SYMLINK    := ln -sf
-FIND       := find
-GREP       := grep
-XARGS      := xargs
-RANLIB     := ranlib
-INSTALL    := install -c
-
-# Used to refresh CHANGELOG.
-GIT        := git
-GIT_LOG    := $(GIT) log --decorate
-
-
-
 #
 # --- Development tools definitions --------------------------------------------
 #
@@ -86,7 +53,7 @@ endif
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
-CMISCFLAGS     := -std=c99 # -fopenmp -pg
+CMISCFLAGS     := -std=c99
 CPICFLAGS      := -fPIC
 CWARNFLAGS     := -Wall
 
@@ -103,13 +70,6 @@ endif
 CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
-# Aggregate all of the flags into multiple groups: one for standard
-# compilation, and one for each of the supported "special" compilation
-# modes.
-CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
-CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
-CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
-
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/configure b/configure
index 5eab11149..325120c32 100755
--- a/configure
+++ b/configure
@@ -88,6 +88,14 @@ print_usage()
 	echo "                 Enable (disabled by default) building BLIS as a shared"
 	echo "                 library. May be combined with --enable-static."
 	echo " "
+	echo "   -t MODEL, --enable-threading[=MODEL], --disable-threading"
+	echo " "
+	echo "                 Enable threading in the library, using threading model"
+	echo "                 MODEL={auto,omp,pthreads,no}. If MODEL=no or "
+	echo "                 --disable-threading is specified, threading will be"
+	echo "                 disabled. If MODEL=auto or is unspecified, a model"
+	echo "                 will be chosen automatically. The default is 'auto'."
+	echo " "
 	echo "   -q, --quiet   Suppress informational output. By default, configure"
 	echo "                 is verbose. (NOTE: -q is not yet implemented)"
 	echo " "
@@ -179,11 +187,14 @@ main()
 	debug_type=''
 	debug_flag=''
 
+	# The threading flag.
+	threading_model='auto'
+
 	# Option variables.
 	quiet_flag=''
 	
 	# Additional flags.
-	enable_verbose='yes'
+	enable_verbose='no'
 	enable_static='yes'
 	enable_shared='no'
 
@@ -210,7 +221,7 @@ main()
 
 
 	# Process our command line options.
-	while getopts ":hp:d:q-:" opt; do
+	while getopts ":hp:d:t:q-:" opt; do
 		case $opt in
 			-)
 				case "$OPTARG" in
@@ -232,6 +243,9 @@ main()
 						debug_flag=1
 						debug_type=${OPTARG#*=}
 						;;
+					disable-debug)
+						debug_flag=0
+						;;
 					enable-verbose-make)
 						enable_verbose='yes'
 						;;
@@ -250,6 +264,15 @@ main()
 					disable-shared)
 						enable_shared='no'
 						;;
+					enable-threading)
+						threading_model='auto'
+						;;
+					enable-threading=*)
+						threading_model=${OPTARG#*=}
+						;;
+					disable-threading)
+						threading_model='no'
+						;;
 					*)
 						print_usage
 						;;
@@ -268,6 +291,9 @@ main()
 			q)
 				quiet_flag=1
 				;;
+			t)
+				threading_model=$OPTARG
+				;;
 			\?)
 				print_usage
 				;;
@@ -376,6 +402,46 @@ main()
 		debug_type='off'
 		echo "${script_name}: debug symbols disabled."
 	fi
+
+
+	# Check if the verbose make flag was specified.
+	if [ "x${enable_verbose}" = "xyes" ]; then
+		echo "${script_name}: enabling verbose make output, disable with 'make V=0'."
+	else
+		echo "${script_name}: disabling verbose make output, enable with 'make V=1'."
+	fi
+
+
+	# Check if the static lib flag was specified.
+	if [ "x${enable_static}" = "xyes" ]; then
+		echo "${script_name}: building BLIS as a static library."
+	fi
+	
+	# Check if the shared lib flag was specified.
+	if [ "x${enable_shared}" = "xyes" ]; then
+		echo "${script_name}: building BLIS as a shared library."
+	fi
+	
+	# Check if neither flag was specified.
+	if [ "x${enable_static}" = "xno" -a "x${enable_shared}" = "xno" ]; then
+		echo "Neither a shared nor static library build has been requested."
+		exit 1
+	fi
+	
+	
+	# Check the threading model flag.
+	if [ "x${threading_model}" = "xauto" ]; then
+		echo "${script_name}: determining the threading model automatically."
+	elif [ "x${threading_model}" = "xomp" ]; then
+		echo "${script_name}: using OpenMP for threading."
+	elif [ "x${threading_model}" = "xpthreads" ]; then
+		echo "${script_name}: using Pthreads for threading."
+	elif [ "x${threading_model}" = "xno" ]; then
+		echo "${script_name}: threading is disabled."
+	else
+		echo "Unsupported threading model: ${threading_model}."
+		exit 1
+	fi
 	
 	
 	# Determine the compiler vendor if CC was specified.
@@ -389,7 +455,7 @@ main()
 			cc_vendor=`$CC -qversion 2>/dev/null | grep -o 'IBM'`
 		fi
 		if [ -z "$cc_vendor" ]; then
-			echo Unable to determine compiler vendor.
+			echo "Unable to determine compiler vendor."
 			exit 1
 		fi
 		cc_vendor=`echo $cc_vendor | { read first rest; echo $first; }`
@@ -416,7 +482,8 @@ main()
 		| sed "s/@install_prefix@/${install_prefix_esc}/g" \
 		| sed "s/@enable_verbose@/${enable_verbose}/g" \
 		| sed "s/@enable_static@/${enable_static}/g" \
-		| sed "s/@enable_shared@/${enable_shared}/g" \
+		| sed "s/@enable_dynamic@/${enable_shared}/g" \
+		| sed "s/@threading_model@/${threading_model}/g" \
 		> "${config_mk_out_path}"
 
 

From 8442d65c9ead0376fc5f2dfad62fd4862ab9b2b3 Mon Sep 17 00:00:00 2001
From: Devin Matthews <dmatthews@utexas.edu>
Date: Fri, 25 Mar 2016 20:06:48 -0500
Subject: [PATCH 07/10] Replace -march=native with specific architecture flags
 to support cross-compiling, and add icc support for Intel architectures.

---
 config/carrizo/make_defs.mk     |  2 +-
 config/cortex-a15/make_defs.mk  |  2 +-
 config/cortex-a9/make_defs.mk   |  2 +-
 config/dunnington/make_defs.mk  | 15 +++++++++++----
 config/haswell/make_defs.mk     | 15 +++++++++++----
 config/loongson3a/make_defs.mk  |  2 +-
 config/piledriver/make_defs.mk  |  2 +-
 config/reference/make_defs.mk   |  2 +-
 config/sandybridge/make_defs.mk | 15 +++++++++++----
 config/template/make_defs.mk    |  2 +-
 10 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk
index 4708a8f36..aaecb2d2c 100644
--- a/config/carrizo/make_defs.mk
+++ b/config/carrizo/make_defs.mk
@@ -67,7 +67,7 @@ else
 COPTFLAGS      := -O2 -fomit-frame-pointer
 endif
 
-CVECFLAGS      := -mavx -mfma -march=native -mfpmath=sse
+CVECFLAGS      := -mavx -mfma -march=bdver4 -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # --- Determine the archiver and related flags ---
diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk
index 6f584f14c..ec5360da4 100644
--- a/config/cortex-a15/make_defs.mk
+++ b/config/cortex-a15/make_defs.mk
@@ -67,7 +67,7 @@ else
 COPTFLAGS      := -O2
 endif
 
-CVECFLAGS      := -march=armv7-a #-msse3 -march=native # -mfpmath=sse
+CVECFLAGS      := -march=armv7-a
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # --- Determine the archiver and related flags ---
diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk
index 6f584f14c..ec5360da4 100644
--- a/config/cortex-a9/make_defs.mk
+++ b/config/cortex-a9/make_defs.mk
@@ -67,7 +67,7 @@ else
 COPTFLAGS      := -O2
 endif
 
-CVECFLAGS      := -march=armv7-a #-msse3 -march=native # -mfpmath=sse
+CVECFLAGS      := -march=armv7-a
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # --- Determine the archiver and related flags ---
diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk
index e67d45e85..8448b723f 100644
--- a/config/dunnington/make_defs.mk
+++ b/config/dunnington/make_defs.mk
@@ -47,9 +47,7 @@ ifeq ($(CC),)
 CC             := gcc
 CC_VENDOR      := gcc
 endif
-ifneq ($(CC_VENDOR),gcc)
-$(error gcc is required for this configuration.)
-endif
+
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
@@ -67,9 +65,18 @@ else
 COPTFLAGS      := -O2 -fomit-frame-pointer
 endif
 
-CVECFLAGS      := -msse3 -march=native -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
+ifeq ($(CC_VENDOR),gcc)
+CVECFLAGS      := -msse3 -march=nehalem -mfpmath=sse
+else
+ifeq ($(CC_VENDOR),icc)
+CVECFLAGS      := -xSSE4.2
+else
+$(error gcc or icc is required for this configuration.)
+endif
+endif
+
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk
index cbc11f37a..cb0fe5c11 100644
--- a/config/haswell/make_defs.mk
+++ b/config/haswell/make_defs.mk
@@ -47,9 +47,7 @@ ifeq ($(CC),)
 CC             := gcc
 CC_VENDOR      := gcc
 endif
-ifneq ($(CC_VENDOR),gcc)
-$(error gcc is required for this configuration.)
-endif
+
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
@@ -67,9 +65,18 @@ else
 COPTFLAGS      := -O3
 endif
 
-CVECFLAGS      := -mavx2 -mfma -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
+ifeq ($(CC_VENDOR),gcc)
+CVECFLAGS      := -mavx2 -mfma -mfpmath=sse -march=haswell
+else
+ifeq ($(CC_VENDOR),icc)
+CVECFLAGS      := -xCORE-AVX2
+else
+$(error gcc or icc is required for this configuration.)
+endif
+endif
+
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk
index 8bb13192c..bb1248d37 100644
--- a/config/loongson3a/make_defs.mk
+++ b/config/loongson3a/make_defs.mk
@@ -67,7 +67,7 @@ else
 COPTFLAGS      := -O3 -mtune=loongson3a
 endif
 
-CVECFLAGS      := -march=loongson3a #-msse3 -march=native # -mfpmath=sse
+CVECFLAGS      := -march=loongson3a
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # --- Determine the archiver and related flags ---
diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk
index 4708a8f36..e241789dd 100644
--- a/config/piledriver/make_defs.mk
+++ b/config/piledriver/make_defs.mk
@@ -67,7 +67,7 @@ else
 COPTFLAGS      := -O2 -fomit-frame-pointer
 endif
 
-CVECFLAGS      := -mavx -mfma -march=native -mfpmath=sse
+CVECFLAGS      := -mavx -mfma -march=bdver2 -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # --- Determine the archiver and related flags ---
diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk
index b17e3a0ba..736e5ee4d 100644
--- a/config/reference/make_defs.mk
+++ b/config/reference/make_defs.mk
@@ -67,7 +67,7 @@ else
 COPTFLAGS      := -O2
 endif
 
-CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
+CVECFLAGS      := 
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # --- Determine the archiver and related flags ---
diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk
index c69387c7b..9f6c4366a 100644
--- a/config/sandybridge/make_defs.mk
+++ b/config/sandybridge/make_defs.mk
@@ -47,9 +47,7 @@ ifeq ($(CC),)
 CC             := gcc
 CC_VENDOR      := gcc
 endif
-ifneq ($(CC_VENDOR),gcc)
-$(error gcc is required for this configuration.)
-endif
+
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d). 
 # NOTE: This is needed to enable posix_memalign().
 CPPROCFLAGS    := -D_POSIX_C_SOURCE=200112L
@@ -67,9 +65,18 @@ else
 COPTFLAGS      := -O3
 endif
 
-CVECFLAGS      := -mavx -mfpmath=sse -march=native #-msse3 -march=native # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
+ifeq ($(CC_VENDOR),gcc)
+CVECFLAGS      := -mavx -mfpmath=sse -march=sandybridge
+else
+ifeq ($(CC_VENDOR),icc)
+CVECFLAGS      := -xAVX
+else
+$(error gcc or icc is required for this configuration.)
+endif
+endif
+
 # --- Determine the archiver and related flags ---
 AR             := ar
 ARFLAGS        := cru
diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk
index b17e3a0ba..37de32882 100644
--- a/config/template/make_defs.mk
+++ b/config/template/make_defs.mk
@@ -67,7 +67,7 @@ else
 COPTFLAGS      := -O2
 endif
 
-CVECFLAGS      := #-msse3 -march=native # -mfpmath=sse
+CVECFLAGS      := #-msse3 -march=core2 # -mfpmath=sse
 CKOPTFLAGS     := $(COPTFLAGS)
 
 # --- Determine the archiver and related flags ---

From 469429ec34e5b1a172ce35596f9c7afdaacac131 Mon Sep 17 00:00:00 2001
From: Devin Matthews <dmatthews@utexas.edu>
Date: Fri, 25 Mar 2016 20:45:41 -0500
Subject: [PATCH 08/10] Fix LD_FLAGS -> LDFLAGS.

---
 Makefile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 2b11f40a9..1f5ae5df7 100644
--- a/Makefile
+++ b/Makefile
@@ -189,11 +189,11 @@ THREADING_MODEL := omp
 endif
 ifeq ($(THREADING_MODEL),omp)
 CTHREADFLAGS := -fopenmp -DBLIS_ENABLE_OPENMP
-LD_FLAGS     += -fopenmp
+LDFLAGS      += -fopenmp
 endif
 ifeq ($(THREADING_MODEL),pthreads)
 CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
-LD_FLAGS     += -pthread
+LDFLAGS      += -pthread
 endif
 endif
 
@@ -203,11 +203,11 @@ THREADING_MODEL := omp
 endif
 ifeq ($(THREADING_MODEL),omp)
 CTHREADFLAGS := -openmp -DBLIS_ENABLE_OPENMP
-LD_FLAGS     += -openmp
+LDFLAGS      += -openmp
 endif
 ifeq ($(THREADING_MODEL),pthreads)
 CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
-LD_FLAGS     += -pthread
+LDFLAGS      += -pthread
 endif
 endif
 
@@ -220,7 +220,7 @@ $(error OpenMP is not supported with Clang.)
 endif
 ifeq ($(THREADING_MODEL),pthreads)
 CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
-LD_FLAGS     += -pthread
+LDFLAGS      += -pthread
 endif
 endif
 

From 0171ad58997b3a5a9b76301511dbe0751fffc940 Mon Sep 17 00:00:00 2001
From: Devin Matthews <dmatthews@utexas.edu>
Date: Mon, 28 Mar 2016 13:55:06 -0500
Subject: [PATCH 09/10] Add icc and clang support for Intel architectures;
 fixes #47. Note: 2bd036f fixes #49.

---
 Makefile                        | 34 ++++++++++++++++++++++++++++++---
 build/config.mk.in              |  1 -
 config/dunnington/make_defs.mk  |  8 ++++++--
 config/haswell/make_defs.mk     |  8 ++++++--
 config/sandybridge/make_defs.mk |  8 ++++++--
 configure                       | 19 ------------------
 6 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/Makefile b/Makefile
index 1f5ae5df7..6a4cc637a 100644
--- a/Makefile
+++ b/Makefile
@@ -160,6 +160,29 @@ GIT_LOG    := $(GIT) log --decorate
 
 
 
+#
+# --- Determine the compiler vendor --------------------------------------------
+#
+
+ifneq ($(CC),)
+
+VENDOR_STRING := $(shell $(CC) --version 2>/dev/null)
+ifeq ($(VENDOR_STRING),)
+VENDOR_STRING := $(shell $(CC) -qversion 2>/dev/null)
+endif
+ifeq ($(VENDOR_STRING),)
+$(error Unable to determine compiler vendor.)
+endif
+
+CC_VENDOR := $(firstword $(shell echo '$(VENDOR_STRING)' | grep -Eo 'icc|gcc|clang|emcc|pnacl|IBM'))
+ifeq ($(CC_VENDOR),)
+$(error Unable to determine compiler vendor.)
+endif
+
+endif
+
+
+
 #
 # --- Include makefile definitions file ----------------------------------------
 #
@@ -179,9 +202,11 @@ else
 MAKE_DEFS_MK_PRESENT := no
 endif
 
-# Deal with threading flags and aggregate all of the flags into multiple groups:
-# one for standard compilation, and one for each of the supported "special"
-# compilation modes.
+
+
+#
+# --- Configuration-agnostic flags ---------------------------------------------
+#
 
 ifeq ($(CC_VENDOR),gcc)
 ifeq ($(THREADING_MODEL),auto)
@@ -224,6 +249,9 @@ LDFLAGS      += -pthread
 endif
 endif
 
+# Aggregate all of the flags into multiple groups: one for standard compilation,
+# and one for each of the supported "special" compilation modes.
+
 CFLAGS_NOOPT   := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CTHREADFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
 CFLAGS         := $(COPTFLAGS)  $(CVECFLAGS) $(CFLAGS_NOOPT)
 CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
diff --git a/build/config.mk.in b/build/config.mk.in
index a043d7aa9..8bdb427a0 100644
--- a/build/config.mk.in
+++ b/build/config.mk.in
@@ -50,7 +50,6 @@ DEBUG_TYPE      := @debug_type@
 
 # The C compiler.
 CC              := @CC@
-CC_VENDOR       := @cc_vendor@
 
 # The requested threading model.
 THREADING_MODEL := @threading_model@
diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk
index 8448b723f..fed36506b 100644
--- a/config/dunnington/make_defs.mk
+++ b/config/dunnington/make_defs.mk
@@ -68,12 +68,16 @@ endif
 CKOPTFLAGS     := $(COPTFLAGS)
 
 ifeq ($(CC_VENDOR),gcc)
-CVECFLAGS      := -msse3 -march=nehalem -mfpmath=sse
+CVECFLAGS      := -msse3 -march=corei7 -mfpmath=sse
 else
 ifeq ($(CC_VENDOR),icc)
 CVECFLAGS      := -xSSE4.2
 else
-$(error gcc or icc is required for this configuration.)
+ifeq ($(CC_VENDOR),clang)
+CVECFLAGS      := -msse3 -mfpmath=sse -march=corei7
+else
+$(error gcc, icc, or clang is required for this configuration.)
+endif
 endif
 endif
 
diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk
index cb0fe5c11..1640a40b9 100644
--- a/config/haswell/make_defs.mk
+++ b/config/haswell/make_defs.mk
@@ -68,12 +68,16 @@ endif
 CKOPTFLAGS     := $(COPTFLAGS)
 
 ifeq ($(CC_VENDOR),gcc)
-CVECFLAGS      := -mavx2 -mfma -mfpmath=sse -march=haswell
+CVECFLAGS      := -mavx2 -mfma -mfpmath=sse -march=core-avx2
 else
 ifeq ($(CC_VENDOR),icc)
 CVECFLAGS      := -xCORE-AVX2
 else
-$(error gcc or icc is required for this configuration.)
+ifeq ($(CC_VENDOR),clang)
+CVECFLAGS      := -mavx2 -mfma -mfpmath=sse -march=core-avx2
+else
+$(error gcc, icc, or clang is required for this configuration.)
+endif
 endif
 endif
 
diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk
index 9f6c4366a..082a73f92 100644
--- a/config/sandybridge/make_defs.mk
+++ b/config/sandybridge/make_defs.mk
@@ -68,12 +68,16 @@ endif
 CKOPTFLAGS     := $(COPTFLAGS)
 
 ifeq ($(CC_VENDOR),gcc)
-CVECFLAGS      := -mavx -mfpmath=sse -march=sandybridge
+CVECFLAGS      := -mavx -mfpmath=sse -march=corei7-avx
 else
 ifeq ($(CC_VENDOR),icc)
 CVECFLAGS      := -xAVX
 else
-$(error gcc or icc is required for this configuration.)
+ifeq ($(CC_VENDOR),clang)
+CVECFLAGS      := -mavx -mfpmath=sse -march=corei7-avx
+else
+$(error gcc, icc, or clang is required for this configuration.)
+endif
 endif
 endif
 
diff --git a/configure b/configure
index 325120c32..20c4271ea 100755
--- a/configure
+++ b/configure
@@ -442,24 +442,6 @@ main()
 		echo "Unsupported threading model: ${threading_model}."
 		exit 1
 	fi
-	
-	
-	# Determine the compiler vendor if CC was specified.
-	if [ -n "$CC" ]; then
-		if $CC --version 2>/dev/null | grep -q 'pnacl-version'; then
-			cc_vendor='pnacl-clang'
-		else
-			cc_vendor=`$CC --version 2>/dev/null | grep -Eo 'icc|gcc|clang|emcc'`
-		fi
-		if [ -z "$cc_vendor" ]; then
-			cc_vendor=`$CC -qversion 2>/dev/null | grep -o 'IBM'`
-		fi
-		if [ -z "$cc_vendor" ]; then
-			echo "Unable to determine compiler vendor."
-			exit 1
-		fi
-		cc_vendor=`echo $cc_vendor | { read first rest; echo $first; }`
-	fi
 
 
 	# Insert escape characters into the paths used in the sed command below.
@@ -477,7 +459,6 @@ main()
 		| sed "s/@config_name@/${config_name}/g" \
 		| sed "s/@dist_path@/${dist_path_esc}/g" \
 		| sed "s/@CC@/${cc_esc}/g" \
-		| sed "s/@cc_vendor@/${cc_vendor}/g" \
 		| sed "s/@debug_type@/${debug_type}/g" \
 		| sed "s/@install_prefix@/${install_prefix_esc}/g" \
 		| sed "s/@enable_verbose@/${enable_verbose}/g" \

From 1b09e343dfe5b48b4842e2cb96f41c8cc249bad0 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Tue, 29 Mar 2016 12:55:28 -0500
Subject: [PATCH 10/10] Updated gcc version from 4.8 to 4.9 in .travis.yml.

---
 .travis.yml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index bfe7412a5..71875d79c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,20 +15,19 @@ env:
   - RUN_TEST=0 BUILD_CONFIG="carrizo"
 
 install:
-- if [ "$CC" = "gcc" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
+- if [ "$CC" = "gcc" ]; then export CC="gcc-4.9"; fi
 addons:
   apt:
     sources:
     - ubuntu-toolchain-r-test
     packages:
-    - gcc-4.8
-    - g++-4.8
+    - gcc-4.9
     - clang
 
 
 
 script:
   - ./configure $BUILD_CONFIG
-  - make CC=gcc-4.8
+  - make CC=gcc-4.9
   - if [ $RUN_TEST -eq 1 ]; then make BLIS_ENABLE_TEST_OUTPUT=yes test; fi
-  - if [ $RUN_TEST -eq 1 ]; then ./build/check-test.sh ./output.testsuite; fi
\ No newline at end of file
+  - if [ $RUN_TEST -eq 1 ]; then ./build/check-test.sh ./output.testsuite; fi