From a0a7b85ac3e157af53cff8db0e008f4a3f90372c Mon Sep 17 00:00:00 2001
From: Francisco Igual
Date: Tue, 27 Oct 2015 08:59:15 +0000
Subject: [PATCH] Fixed incomplete code in the double precision ARMv8
 microkernel.

---
 config/armv8a/bli_kernel.h               |  10 +-
 config/armv8a/make_defs.mk               |  18 +-
 kernels/armv8a/neon/3/bli_gemm_opt_4x4.c | 294 +++++++++++++++++++----
 3 files changed, 260 insertions(+), 62 deletions(-)

diff --git a/config/armv8a/bli_kernel.h b/config/armv8a/bli_kernel.h
index 2e2705122..3bd7da722 100644
--- a/config/armv8a/bli_kernel.h
+++ b/config/armv8a/bli_kernel.h
@@ -51,12 +51,12 @@
 // (b) MR (for zero-padding purposes when MR and NR are "swapped")
 //
 
-#define BLIS_DEFAULT_MC_S 416 // 1280 //160 // 160 // 160 //2048 //336
-#define BLIS_DEFAULT_KC_S 704 //1280 //672 //528 // 856 //2048 //528
+#define BLIS_DEFAULT_MC_S 336
+#define BLIS_DEFAULT_KC_S 336
 #define BLIS_DEFAULT_NC_S 4096
 
-#define BLIS_DEFAULT_MC_D 80 //176
-#define BLIS_DEFAULT_KC_D 336 //368
+#define BLIS_DEFAULT_MC_D 160
+#define BLIS_DEFAULT_KC_D 304
 #define BLIS_DEFAULT_NC_D 4096
 
 #define BLIS_DEFAULT_MC_C 64
@@ -132,8 +132,6 @@
 //#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...)
 //#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...)
 
-
-
 // -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk
index e81dc5026..63c03c6a0 100644
--- a/config/armv8a/make_defs.mk
+++ b/config/armv8a/make_defs.mk
@@ -76,20 +76,18 @@ GIT_LOG := $(GIT) log --decorate
 #
 
 # --- Determine the C compiler and related flags ---
-##CC := gcc
-CC := aarch64-linux-gnu-gcc
-#CC := arm-linux-gnueabihf-gcc-4.9.2
+CC := gcc
+
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d).
 # NOTE: This is needed to enable posix_memalign().
-CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
-#CMISCFLAGS := -std=c99 -mtune=cortex-a57 -mfpu=neon-fp-armv8 -march=armv8-a #-mfloat-abi=hard -mfpu=neon
-CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-mtune=cortex-a57 -march=armv8-a -mfloat-abi=hard -mfpu=neon
+CPPROCFLAGS := -D_GNU_SOURCE
+CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -fopenmp -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
 CPICFLAGS := -fPIC
-CDBGFLAGS := -g
+CDBGFLAGS := -g #-g3 -gdwarf-2
 CWARNFLAGS := -Wall
-COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8 #-mfpu=neon -O2
+COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
 CKOPTFLAGS := $(COPTFLAGS)
-CVECFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8
+CVECFLAGS := #-march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8
 
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
@@ -105,7 +103,7 @@ ARFLAGS := cru
 # --- Determine the linker and related flags ---
 LINKER := $(CC)
 SOFLAGS := -shared
-LDFLAGS := -lm
+LDFLAGS := -lm -fopenmp
diff --git a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
index ba98d8dd3..2a54fe825 100644
--- a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
+++ b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
@@ -1,4 +1,4 @@
-/*
+ /*
 
    BLIS
    An object-based framework for developing high-performance BLAS-like
@@ -36,12 +36,7 @@
 #include "blis.h"
 
 /*
-   o 4x4 Single precision micro-kernel fully functional.
-   o Runnable on ARMv8, compiled with aarch64 GCC.
-   o Use it together with the armv8 BLIS configuration.
    o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz.
-
-   December 2014.
 */
 void bli_sgemm_opt_4x4(
                         dim_t k,
@@ -55,15 +50,15 @@ void bli_sgemm_opt_4x4(
 {
     void* a_next = bli_auxinfo_next_a( data );
     void* b_next = bli_auxinfo_next_b( data );
-
+ 
     dim_t k_iter = k / 4;
-    dim_t k_left = k % 4;
+    dim_t k_left = k % 4; 
 
-__asm__ volatile
+__asm__ volatile 
 (
 " \n\t"
 " \n\t"
-" ldr x0,%[aaddr] \n\t" // Load address of A.
+" ldr x0,%[aaddr] \n\t" // Load address of A. 
 " ldr x1,%[baddr] \n\t" // Load address of B.
 " ldr x2,%[caddr] \n\t" // Load address of C.
 " \n\t"
@@ -433,12 +428,8 @@ __asm__ volatile
 
 
 /*
-   o 4x4 Double precision micro-kernel NOT fully functional yet.
-   o Runnable on ARMv8, compiled with aarch64 GCC.
-   o Use it together with the armv8 BLIS configuration.
-   o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz.
-
-   December 2014.
+   * Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz.
+   * Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz.
 */
 void bli_dgemm_opt_4x4(
                         dim_t k,
@@ -458,8 +449,8 @@ void bli_dgemm_opt_4x4(
 __asm__ volatile
 (
-" \n\t"
-" ldr x0,%[aaddr] \n\t" // Load address of A
+" \n\t" 
+" ldr x0,%[aaddr] \n\t" // Load address of A 
 " ldr x1,%[baddr] \n\t" // Load address of B
 " ldr x2,%[caddr] \n\t" // Load address of C
 " \n\t"
@@ -476,9 +467,16 @@ __asm__ volatile
 " \n\t"
 " ldr x9,%[cs_c] \n\t" // Load cs_c
 " lsl x10,x9,#3 \n\t" // cs_c * sizeof(double)
-" \n\t"
-" ldp q0,q1,[x0],32 \n\t" // Load a into quad
-" ldp q4,q5,[x1],32 \n\t" // Load b into quad
+" lsl x11,x9,#4 \n\t" // 2 * cs_c * sizeof(double) -- AUX.
+" add x12,x10,x11 \n\t" // 3 * cs_c * sizeof(double) -- AUX.
+" \n\t"
+" ldr x13,%[rs_c] \n\t" // Load rs_c.
+" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double).
+" \n\t"
+" prfm pldl1keep,[x2,0] \n\t" // Prefetch c.
+" prfm pldl1keep,[x2,x10] \n\t" // Prefetch c.
+" prfm pldl1keep,[x2,x11] \n\t" // Prefetch c.
+" prfm pldl1keep,[x2,x12] \n\t" // Prefetch c.
 " \n\t"
 " movi v12.2d,#0 \n\t" // Vector for accummulating column 0
 " movi v13.2d,#0 \n\t" // Vector for accummulating column 0
@@ -489,10 +487,33 @@ __asm__ volatile
 " movi v18.2d,#0 \n\t" // Vector for accummulating column 3
 " movi v19.2d,#0 \n\t" // Vector for accummulating column 3
 " \n\t"
+" movi v20.2d,#0 \n\t" // Vector for accumulating column 0
+" movi v21.2d,#0 \n\t" // Vector for accumulating column 0
+" movi v22.2d,#0 \n\t" // Vector for accumulating column 1
+" movi v23.2d,#0 \n\t" // Vector for accumulating column 1
+" movi v24.2d,#0 \n\t" // Vector for accumulating column 2
+" movi v25.2d,#0 \n\t" // Vector for accumulating column 2
+" movi v26.2d,#0 \n\t" // Vector for accumulating column 3
+" movi v27.2d,#0 \n\t" // Vector for accumulating column 3
+" \n\t"
 " ld1r {v31.2d},[x8] \n\t" // Load beta
 " \n\t"
+" \n\t"
+" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left.
+" beq .DCONSIDERKLEFT \n\t"
+" \n\t"
+" ldp q0,q1,[x0],32 \n\t" // Load a
+" ldp q4,q5,[x1],32 \n\t" // Load b
+" \n\t"
+" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one
+" beq .DLASTITER \n\t" // (as the loop is do-while-like).
+" \n\t"
+" \n\t"
 " DLOOP: \n\t" // Body
 " \n\t"
+" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch.
+" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch.
+" \n\t"
 " ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad
 " \n\t"
 " fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate
 " fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate
 " \n\t"
 " fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate
 " fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate
 " \n\t"
 " ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad
 " \n\t"
 " fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate
 " fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate
 " \n\t"
 " fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate
 " fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate
 " \n\t"
@@ -525,9 +546,86 @@ __asm__ volatile
 " fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate
 " fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate
 " \n\t"
-" add x4,x4,1 \n\t" // i = i+1
-" cmp x4,x5 \n\t" // Continue
-" blt DLOOP \n\t" // if i < N
+" prfm pldl1keep,[x0,#64] \n\t" // Prefetch.
+" prfm pldl1keep,[x1,#64] \n\t" // Prefetch.
+" \n\t"
+" sub x5,x5,1 \n\t" // i = i-1
+" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1.
+" bne DLOOP \n\t"
+" \n\t"
+".DLASTITER: \n\t"
+" \n\t"
+" ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad
+" \n\t"
+" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accumulate
+" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accumulate
+" \n\t"
+" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accumulate
+" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accumulate
+" \n\t"
+" ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad
+" \n\t"
+" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accumulate
+" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accumulate
+" \n\t"
+" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accumulate
+" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accumulate
+" \n\t"
+" ld1r {v30.2d},[x7] \n\t" // Load alpha.
+" \n\t" +" fmla v12.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v14.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v13.2d,v3.2d,v6.d[0] \n\t" // Accummulate +" fmla v15.2d,v3.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v16.2d,v2.2d,v7.d[0] \n\t" // Accummulate +" fmla v18.2d,v2.2d,v7.d[1] \n\t" // Accummulate +" \n\t" +" fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate +" fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate +" \n\t" +" .DCONSIDERKLEFT: \n\t" +" cmp x6,0 \n\t" // If k_left == 0, we are done. +" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. +" \n\t" +".DLOOPKLEFT: \n\t" +" \n\t" +" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. +" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. +" \n\t" +" ldp q0,q1,[x0],32 \n\t" // Load a into quad +" ldp q4,q5,[x1],32 \n\t" // Load b into quad +" \n\t" +//" sub x6,x6,1 \n\t" +" \n\t" +" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" \n\t" +" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" \n\t" +" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" \n\t" +" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" \n\t" +//" cmp x6,0 \n\t" // Iterate again. +//" bne .DLOOPKLEFT \n\t" // if i!=0. +" \n\t" +" .DPOSTACCUM: \n\t" +" ld1r {v30.2d},[x7] \n\t" // Load alpha. +" \n\t" +" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" bne .DGENSTORED \n\t" +" \n\t" +" .DCOLSTORED: \n\t" // C is column-major. +" fcmp d31,#0.0 \n\t" +" beq .DBETAZEROCOLSTORED \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" \n\t" " \n\t" " ldp q0,q1,[x2] \n\t" // Load c into quad and increment by cs_c " add x2,x2,x10 \n\t" @@ -537,38 +635,140 @@ __asm__ volatile " add x2,x2,x10 \n\t" " ldp q6,q7,[x2] \n\t" // Load c into quad and increment by cs_c " \n\t" -" ld1r {v30.2d},[x7] \n\t" // Load alpha " \n\t" -" fmul v0.2d,v0.2d,v31.d[0] \n\t" // Scale by beta -" fmul v1.2d,v1.2d,v31.d[0] \n\t" // Scale by beta -" fmul v2.2d,v2.2d,v31.d[0] \n\t" // Scale by beta -" fmul v3.2d,v3.2d,v31.d[0] \n\t" // Scale by beta -" fmul v4.2d,v4.2d,v31.d[0] \n\t" // Scale by beta -" fmul v5.2d,v5.2d,v31.d[0] \n\t" // Scale by beta -" fmul v6.2d,v6.2d,v31.d[0] \n\t" // Scale by beta -" fmul v7.2d,v7.2d,v31.d[0] \n\t" // Scale by beta +" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta +" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta +" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta +" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta +" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta +" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta +" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta +" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta " \n\t" " prfm pldl2keep,[x16] \n\t" " prfm pldl2keep,[x17] \n\t" " \n\t" -" fmla v0.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v1.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v2.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v3.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v4.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v5.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v6.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v7.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" .DBETAZEROCOLSTORED: \n\t" // If beta==0, we won't read from C (nor scale). 
" \n\t" " ldr x2,%[caddr] \n\t" // Load address of C " \n\t" -" stp q0,q1,[x2] \n\t" // Store quad into c and increment by cs_c +" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" \n\t" +" stp q20,q21,[x2] \n\t" // Store quad into c and increment by cs_c " add x2,x2,x10 \n\t" -" stp q2,q3,[x2] \n\t" // Store quad into c+4 and increment by cs_c +" stp q22,q23,[x2] \n\t" // Store quad into c+4 and increment by cs_c " add x2,x2,x10 \n\t" -" stp q4,q5,[x2] \n\t" // Store quad into c+8 and increment by cs_c +" stp q24,q25,[x2] \n\t" // Store quad into c+8 and increment by cs_c " add x2,x2,x10 \n\t" -" stp q6,q7,[x2] \n\t" // Store quad into c+16 and increment by cs_c +" stp q26,q27,[x2] \n\t" // Store quad into c+16 and increment by cs_c +" \n\t" +" b .DEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). +" \n\t" +" .DGENSTORED: \n\t" // C is general-stride stored. +" \n\t" +" fcmp d31,#0.0 \n\t" +" beq .DBETAZEROGENSTORED \n\t" +" \n\t" // If beta!=0, then we can read from C. +" \n\t" // TODO: this was done fast. Rearrange to remove so many address reloads. +" ldr x2,%[caddr] \n\t" // Load address of C. +" \n\t" +" ld1 {v0.d}[0],[x2],x14 \n\t" // Load c00 into quad and increment by rs_c. +" ld1 {v0.d}[1],[x2],x14 \n\t" // Load c01 into quad and increment by rs_c. +" ld1 {v1.d}[0],[x2],x14 \n\t" // Load c02 into quad and increment by rs_c. +" ld1 {v1.d}[1],[x2],x14 \n\t" // Load c03 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" ld1 {v2.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v2.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v3.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v3.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" ld1 {v4.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v4.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v5.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v5.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" ld1 {v6.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v6.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v7.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v7.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" \n\t" +" prfm pldl1keep,[x16,0] \n\t" // Prefetch. +" prfm pldl1keep,[x17,0] \n\t" // Prefetch. 
+" \n\t" +" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta +" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta +" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta +" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta +" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta +" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta +" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta +" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTORED: \n\t" // If beta==0, we cannot read from C (nor scale). +" \n\t" +" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" \n\t" +" st1 {v20.d}[0],[x2],x14 \n\t" // Store c00 into quad and increment by rs_c. +" st1 {v20.d}[1],[x2],x14 \n\t" // Store c01 into quad and increment by rs_c. +" st1 {v21.d}[0],[x2],x14 \n\t" // Store c02 into quad and increment by rs_c. +" st1 {v21.d}[1],[x2],x14 \n\t" // Store c03 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" st1 {v22.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v22.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v23.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v23.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" st1 {v24.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v24.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v25.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v25.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" st1 {v26.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v26.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v27.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v27.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" \n\t" +" .DEND: \n\t" // Done! " \n\t" :// output operands (none) :// input operands @@ -587,13 +787,15 @@ __asm__ volatile "x0","x1","x2", "x4","x5","x6", "x7","x8","x9", - "x10","x11","x12", + "x10","x11","x12","x13","x14","x16","x17", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", "v9","v10","v11", "v12","v13","v14", "v15","v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", "v30","v31" );