From 7fabd896af773623ed01820a71bbff432e8a7d25 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Sat, 29 May 2021 16:28:03 +0900
Subject: [PATCH 1/4] Asm Flag Mingling for Darwin_Aarch64

Apple+Arm64 requires additional "tagging" of local symbols.
---
 kernels/armv8a/3/armv8a_asm_utils.h         |  49 ++++++++
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 125 ++++++++++----------
 2 files changed, 112 insertions(+), 62 deletions(-)
 create mode 100644 kernels/armv8a/3/armv8a_asm_utils.h

diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h
new file mode 100644
index 000000000..7bf97d555
--- /dev/null
+++ b/kernels/armv8a/3/armv8a_asm_utils.h
@@ -0,0 +1,49 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+// Local-label helpers for inline asm: a label is spelled "L<name>" on Apple targets and ".<name>" elsewhere.
+#if defined(__APPLE__) // LABEL defines a label; BEQ/BNE/BRANCH emit b.eq/b.ne/b to that label.
+#define LABEL(str) "   L" #str": \n\t"
+#define BEQ(str) "b.eq L" #str"  \n\t"
+#define BNE(str) "b.ne L" #str"  \n\t"
+#define BRANCH(str) "b L" #str"  \n\t"
+#else // Non-Apple targets: same four macros, with a "." label prefix.
+#define LABEL(str) "   ." #str": \n\t"
+#define BEQ(str) "b.eq ." #str"  \n\t"
+#define BNE(str) "b.ne ." #str"  \n\t"
+#define BRANCH(str) "b ." #str"  \n\t"
+#endif // defined(__APPLE__)
+
diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index c01c67f5a..251931f7c 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -34,6 +34,7 @@
 */
 
 #include "blis.h"
+#include "armv8a_asm_utils.h"
 
 /*
    o 4x4 Single precision micro-kernel fully functional.
@@ -155,7 +156,7 @@ __asm__ volatile
 " dup  v31.4s, wzr                           \n\t" // Vector for accummulating column 11
 "                                            \n\t"
 " cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
-" beq .SCONSIDERKLEFT                        \n\t"
+BEQ(SCONSIDERKLEFT)
 "                                            \n\t"
 " ldr q0, [x0]                               \n\t"
 " ldr q1, [x0, #16]                          \n\t" // Load a
@@ -168,9 +169,9 @@ __asm__ volatile
 " add x1, x1, #48                            \n\t" //update address of B
 "                                            \n\t"
 " cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
-" beq .SLASTITER                             \n\t" // (as loop is do-while-like).
+BEQ(SLASTITER)                                     // (as loop is do-while-like).
 "                                            \n\t"
-" .SLOOPKITER:                               \n\t" // Body of the k_iter loop.
+LABEL(SLOOPKITER)                                  // Body of the k_iter loop.
 "                                            \n\t"
 " ldr q5, [x0]                               \n\t"
 " fmla v8.4s, v0.4s,v2.s[0]                  \n\t" // Accummulate.
@@ -316,9 +317,9 @@ __asm__ volatile
 "                                            \n\t" //End It 4
 " sub x5,x5,1                                \n\t" // i-=1.
 " cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-" bne .SLOOPKITER                            \n\t"
+BNE(SLOOPKITER)
 "                                            \n\t" 
-" .SLASTITER:                                \n\t" // Last iteration of k_iter loop.
+LABEL(SLASTITER)                                   // Last iteration of k_iter loop.
 "                                            \n\t" 
 "                                            \n\t"
 " ldr q5, [x0]                               \n\t"
@@ -454,11 +455,11 @@ __asm__ volatile
 " add x0, x0, #96                            \n\t"
 "                                            \n\t" //End It 4
 "                                            \n\t"
-" .SCONSIDERKLEFT:                           \n\t" 
+LABEL(SCONSIDERKLEFT)
 " cmp x6,0                                   \n\t" // If k_left == 0, we are done.
-" beq .SPOSTACCUM                            \n\t" // else, we enter the k_left loop.
+BEQ(SPOSTACCUM)                                    // else, we enter the k_left loop.
 "                                            \n\t"
-" .SLOOPKLEFT:                               \n\t" // Body of the left iterations
+LABEL(SLOOPKLEFT)                                  // Body of the left iterations
 "                                            \n\t"
 " ldr q0, [x0],#16                           \n\t"
 " ldr q1, [x0],#16                           \n\t" // Load a
@@ -497,17 +498,17 @@ __asm__ volatile
 " fmla v31.4s,v1.4s,v4.s[3]                  \n\t" // Accummulate.
 "                                            \n\t"
 " cmp x6,0                                   \n\t" // Iterate again.
-" bne .SLOOPKLEFT                            \n\t" // if i!=0.
+BNE(SLOOPKLEFT)                                    // if i!=0.
 "                                            \n\t"
-" .SPOSTACCUM:                               \n\t"
+LABEL(SPOSTACCUM)
 "                                            \n\t"
 " ld1r {v6.4s},[x7]                          \n\t" // Load alpha.
 " ld1r {v7.4s},[x8]                          \n\t" // Load beta
 "                                            \n\t"
 " cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
-" bne .SGENSTORED                            \n\t"
+BNE(SGENSTORED)
 "                                            \n\t"
-" .SCOLSTORED:                               \n\t" // C is column-major.
+LABEL(SCOLSTORED)                                  // C is column-major.
 "                                            \n\t"
 " dup  v0.4s, wzr                            \n\t"
 " dup  v1.4s, wzr                            \n\t"
@@ -517,7 +518,7 @@ __asm__ volatile
 " dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROCOLSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q0, [x2]                               \n\t" //Load column 0 of C
 " ldr q1, [x2, #16]                          \n\t"
@@ -533,7 +534,7 @@ __asm__ volatile
 " fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROCOLSTOREDS1:                     \n\t"
+LABEL(SBETAZEROCOLSTOREDS1)
 "                                            \n\t"
 " fmla v0.4s,v8.4s,v6.s[0]                   \n\t" // Scale by alpha
 " fmla v1.4s,v9.4s,v6.s[0]                   \n\t" // Scale by alpha
@@ -557,7 +558,7 @@ __asm__ volatile
 " dup  v13.4s, wzr                           \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROCOLSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q8, [x18]                              \n\t" //Load column 3 of C
 " ldr q9, [x18, #16]                         \n\t"
@@ -573,7 +574,7 @@ __asm__ volatile
 " fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 " fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROCOLSTOREDS2:                     \n\t"
+LABEL(SBETAZEROCOLSTOREDS2)
 "                                            \n\t"
 " fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
@@ -597,7 +598,7 @@ __asm__ volatile
 " dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROCOLSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q0, [x21]                              \n\t" //Load column 6 of C
 " ldr q1, [x21, #16]                         \n\t"
@@ -613,7 +614,7 @@ __asm__ volatile
 " fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROCOLSTOREDS3:                     \n\t"
+LABEL(SBETAZEROCOLSTOREDS3)
 "                                            \n\t"
 " fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
@@ -637,7 +638,7 @@ __asm__ volatile
 " dup  v13.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROCOLSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q8, [x24]                              \n\t" //Load column 9 of C
 " ldr q9, [x24, #16]                         \n\t"
@@ -653,7 +654,7 @@ __asm__ volatile
 " fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 " fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROCOLSTOREDS4:                     \n\t"
+LABEL(SBETAZEROCOLSTOREDS4)
 "                                            \n\t"
 " prfm pldl2keep,[x3]                        \n\t"
 " prfm pldl2keep,[x4]                        \n\t"
@@ -673,10 +674,10 @@ __asm__ volatile
 " str q13, [x26, #16]                        \n\t"
 "                                            \n\t"
 "                                            \n\t"
-" b .SEND                                    \n\t" // Done (TODO: this obviously needs to be moved down to remove jump).
+BRANCH(SEND)                                       // Done.
 "                                            \n\t"
 "                                            \n\t"
-" .SGENSTORED:                               \n\t" // C is general-stride stored.
+LABEL(SGENSTORED)                                  // C is general-stride stored.
 "                                            \n\t"
 "                                            \n\t"
 " dup  v0.4s, wzr                            \n\t"
@@ -687,7 +688,7 @@ __asm__ volatile
 " dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROGENSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x2                                \n\t"
 "                                            \n\t"
@@ -729,7 +730,7 @@ __asm__ volatile
 " fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROGENSTOREDS1:                     \n\t"
+LABEL(SBETAZEROGENSTOREDS1)
 "                                            \n\t"
 " fmla v0.4s, v8.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v1.4s, v9.4s,v6.s[0]                  \n\t" // Scale by alpha
@@ -779,7 +780,7 @@ __asm__ volatile
 " dup  v13.4s, wzr                           \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROGENSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x18                               \n\t"
 "                                            \n\t"
@@ -821,7 +822,7 @@ __asm__ volatile
 " fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 " fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROGENSTOREDS2:                     \n\t"
+LABEL(SBETAZEROGENSTOREDS2)
 "                                            \n\t"
 " fmla v8.4s, v14.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v9.4s, v15.4s,v6.s[0]                 \n\t" // Scale by alpha
@@ -871,7 +872,7 @@ __asm__ volatile
 " dup  v5.4s, wzr                            \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROGENSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x21                               \n\t"
 "                                            \n\t"
@@ -913,7 +914,7 @@ __asm__ volatile
 " fmul v4.4s,v4.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v5.4s,v5.4s,v7.s[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROGENSTOREDS3:                     \n\t"
+LABEL(SBETAZEROGENSTOREDS3)
 "                                            \n\t"
 " fmla v0.4s,v20.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v1.4s,v21.4s,v6.s[0]                  \n\t" // Scale by alpha
@@ -963,7 +964,7 @@ __asm__ volatile
 " dup  v13.4s, wzr                           \n\t"
 "                                            \n\t"
 " fcmp s7,#0.0                               \n\t"
-" beq .SBETAZEROGENSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+BEQ(SBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x24                               \n\t"
 "                                            \n\t"
@@ -1005,7 +1006,7 @@ __asm__ volatile
 " fmul v12.4s,v12.4s,v7.s[0]                 \n\t" // Scale by beta
 " fmul v13.4s,v13.4s,v7.s[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .SBETAZEROGENSTOREDS4:                     \n\t"
+LABEL(SBETAZEROGENSTOREDS4)
 "                                            \n\t"
 " prfm pldl2keep,[x3]                        \n\t"
 " prfm pldl2keep,[x4]                        \n\t"
@@ -1050,7 +1051,7 @@ __asm__ volatile
 " st1 {v13.s}[2],[x27],x14                   \n\t" // Store c116  into quad and increment by rs_c.
 " st1 {v13.s}[3],[x27],x14                   \n\t" // Store c147  into quad and increment by rs_c.
 "                                            \n\t"
-" .SEND:                                     \n\t" // Done!
+LABEL(SEND)                                        // Done!
 "                                            \n\t"
 :// output operands (none)
 :// input operands
@@ -1203,7 +1204,7 @@ __asm__ volatile
 "                                            \n\t"
 "                                            \n\t"
 " cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
-" beq .DCONSIDERKLEFT                        \n\t"
+BEQ(DCONSIDERKLEFT)
 "                                            \n\t"
 " ldr q0, [x0]                               \n\t" // Load a
 " ldr q1, [x0, #16]                          \n\t"
@@ -1218,9 +1219,9 @@ __asm__ volatile
 " add x1, x1, #64                            \n\t" //update address of B
 "                                            \n\t"
 " cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
-" beq .DLASTITER                             \n\t" // (as loop is do-while-like).
+BEQ(DLASTITER)                                     // (as loop is do-while-like).
 "                                            \n\t"
-" DLOOP:                                     \n\t" // Body
+LABEL(DLOOP)                                       // Body
 "                                            \n\t"
 " fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 " prfm    PLDL1KEEP, [x1, #448]              \n\t" //512-64=448
@@ -1394,9 +1395,9 @@ __asm__ volatile
 "                                            \n\t"
 " sub x5,x5,1                                \n\t" // i-=1
 " cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
-" bne DLOOP                                  \n\t"
+BNE(DLOOP)
 "                                            \n\t"
-".DLASTITER:                                 \n\t"
+LABEL(DLASTITER)
 "                                            \n\t"
 " fmla v8.2d ,v0.2d,v3.d[0]                  \n\t" // Accummulate
 " fmla v9.2d ,v1.2d,v3.d[0]                  \n\t" // Accummulate
@@ -1554,11 +1555,11 @@ __asm__ volatile
 "                                            \n\t"                  //End it 4
 " add x0, x0, #144                           \n\t"
 "                                            \n\t"
-" .DCONSIDERKLEFT:                           \n\t" 
+LABEL(DCONSIDERKLEFT)
 " cmp x6,0                                   \n\t" // If k_left == 0, we are done.
-" beq .DPOSTACCUM                            \n\t" // else, we enter the k_left loop.
+BEQ(DPOSTACCUM)                                    // else, we enter the k_left loop.
 "                                            \n\t"
-".DLOOPKLEFT:                                \n\t"
+LABEL(DLOOPKLEFT)
 "                                            \n\t"
 " ldr q0, [x0],#16                           \n\t"
 " ldr q1, [x0],#16                           \n\t" // Load a
@@ -1605,17 +1606,17 @@ __asm__ volatile
 " fmla v31.2d,v2.2d,v6.d[1]                  \n\t" // Accummulate
 "                                            \n\t"
 " cmp x6,0                                   \n\t" // Iterate again.
-" bne .DLOOPKLEFT                            \n\t" // if i!=0.
+BNE(DLOOPKLEFT)                                    // if i!=0.
 "                                            \n\t"
-" .DPOSTACCUM:                               \n\t"
+LABEL(DPOSTACCUM)
 "                                            \n\t"
 " ld1r {v6.2d},[x7]                          \n\t" // Load alpha.
 " ld1r {v7.2d},[x8]                          \n\t" // Load beta
 "                                            \n\t"
 " cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
-" bne .DGENSTORED                            \n\t"
+BNE(DGENSTORED)
 "                                            \n\t"
-" .DCOLSTORED:                               \n\t" // C is column-major.
+LABEL(DCOLSTORED)                                  // C is column-major.
 "                                            \n\t"
 " dup  v0.2d, xzr                            \n\t"
 " dup  v1.2d, xzr                            \n\t"
@@ -1625,7 +1626,7 @@ __asm__ volatile
 " dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROCOLSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROCOLSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q0, [x2]                               \n\t" //Load column 0 of C
 " ldr q1, [x2, #16]                          \n\t"
@@ -1642,7 +1643,7 @@ __asm__ volatile
 " fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 " fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROCOLSTOREDS1:                     \n\t"
+LABEL(DBETAZEROCOLSTOREDS1)
 "                                            \n\t"
 " fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
 " fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
@@ -1667,7 +1668,7 @@ __asm__ volatile
 " dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROCOLSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q8, [x21]                              \n\t" //Load column 2 of C
 " ldr q9, [x21, #16]                         \n\t"
@@ -1684,7 +1685,7 @@ __asm__ volatile
 " fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 " fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROCOLSTOREDS2:                     \n\t"
+LABEL(DBETAZEROCOLSTOREDS2)
 "                                            \n\t"
 " fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
 " fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
@@ -1709,7 +1710,7 @@ __asm__ volatile
 " dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROCOLSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q0, [x23]                              \n\t" //Load column 4 of C
 " ldr q1, [x23, #16]                         \n\t"
@@ -1726,7 +1727,7 @@ __asm__ volatile
 " fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 " fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROCOLSTOREDS3:                     \n\t"
+LABEL(DBETAZEROCOLSTOREDS3)
 "                                            \n\t"
 " fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
 " fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
@@ -1751,7 +1752,7 @@ __asm__ volatile
 " dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROCOLSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " ldr q8, [x25]                              \n\t" //Load column 6 of C
 " ldr q9, [x25, #16]                         \n\t"
@@ -1768,7 +1769,7 @@ __asm__ volatile
 " fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 " fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROCOLSTOREDS4:                     \n\t"
+LABEL(DBETAZEROCOLSTOREDS4)
 "                                            \n\t"
 " prfm pldl2keep,[x3]                        \n\t"
 " prfm pldl2keep,[x4]                        \n\t"
@@ -1788,9 +1789,9 @@ __asm__ volatile
 " str q12, [x26, #16]                        \n\t"
 " str q13, [x26, #32]                        \n\t"
 "                                            \n\t"
-" b .DEND                                    \n\t"
+BRANCH(DEND)
 "                                            \n\t"
-" .DGENSTORED:                               \n\t" // C is general-stride stored.
+LABEL(DGENSTORED)                                  // C is general-stride stored.
 "                                            \n\t"
 " dup  v0.2d, xzr                            \n\t"
 " dup  v1.2d, xzr                            \n\t"
@@ -1800,7 +1801,7 @@ __asm__ volatile
 " dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROGENSTOREDS1                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x2                                \n\t"
 "                                            \n\t" // Load address of C.
@@ -1827,7 +1828,7 @@ __asm__ volatile
 " fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 " fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROGENSTOREDS1:                     \n\t"
+LABEL(DBETAZEROGENSTOREDS1)
 "                                            \n\t"
 " fmla v0.2d,v8.2d,v6.d[0]                   \n\t" // Scale by alpha
 " fmla v1.2d,v9.2d,v6.d[0]                   \n\t" // Scale by alpha
@@ -1862,7 +1863,7 @@ __asm__ volatile
 " dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROGENSTOREDS2                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x21                               \n\t" // Load address of C.
 "                                            \n\t"
@@ -1889,7 +1890,7 @@ __asm__ volatile
 " fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 " fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROGENSTOREDS2:                     \n\t"
+LABEL(DBETAZEROGENSTOREDS2)
 "                                            \n\t"
 " fmla v8.2d, v14.2d,v6.d[0]                 \n\t" // Scale by alpha
 " fmla v9.2d, v15.2d,v6.d[0]                 \n\t" // Scale by alpha
@@ -1924,7 +1925,7 @@ __asm__ volatile
 " dup  v5.2d, xzr                            \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROGENSTOREDS3                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x23                               \n\t" // Load address of C.
 "                                            \n\t"
@@ -1951,7 +1952,7 @@ __asm__ volatile
 " fmul v4.2d,v4.2d,v7.d[0]                   \n\t" // Scale by beta
 " fmul v5.2d,v5.2d,v7.d[0]                   \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROGENSTOREDS3:                     \n\t"
+LABEL(DBETAZEROGENSTOREDS3)
 "                                            \n\t"
 " fmla v0.2d,v20.2d,v6.d[0]                  \n\t" // Scale by alpha
 " fmla v1.2d,v21.2d,v6.d[0]                  \n\t" // Scale by alpha
@@ -1986,7 +1987,7 @@ __asm__ volatile
 " dup  v13.2d, xzr                           \n\t"
 "                                            \n\t"
 " fcmp d7,#0.0                               \n\t"
-" beq .DBETAZEROGENSTOREDS4                  \n\t" // Taking care of the beta==0 case.
+BEQ(DBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
 " mov x27, x25                               \n\t"
 "                                            \n\t"
@@ -2013,7 +2014,7 @@ __asm__ volatile
 " fmul v12.2d,v12.2d,v7.d[0]                 \n\t" // Scale by beta
 " fmul v13.2d,v13.2d,v7.d[0]                 \n\t" // Scale by beta
 "                                            \n\t"
-" .DBETAZEROGENSTOREDS4:                     \n\t"
+LABEL(DBETAZEROGENSTOREDS4)
 "                                            \n\t"
 " prfm pldl2keep,[x3]                        \n\t"
 " prfm pldl2keep,[x4]                        \n\t"
@@ -2043,7 +2044,7 @@ __asm__ volatile
 " st1 {v13.d}[0],[x27],x14                   \n\t" // Store c74  into quad and increment by rs_c.
 " st1 {v13.d}[1],[x27],x14                   \n\t" // Store c75  into quad and increment by rs_c.
 "                                            \n\t"
-" .DEND:                                     \n\t" // Done!
+LABEL(DEND)                                        // Done!
 "                                            \n\t"
 :// output operands (none)
 :// input operands

From 916e1fa8be3cea0e3e2a4a7e8b00027ac2ee7780 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Sat, 29 May 2021 16:46:52 +0900
Subject: [PATCH 2/4] Armv8A Rename Regs for Clang Compile: FP64 Part

- x7, x8: Used to store the addresses of Alpha and Beta.
  As Alpha & Beta are not used in the k-loops, use x0, x1 to load
  Alpha & Beta's addresses after the k-loops are completed, since A & B's
  addresses are no longer needed there.
  This "ldr [addr]; -> ldr val, [addr]" sequence should not cause much of
  a performance drawback, since it is done outside the k-loops and there
  are plenty of instructions between loading Alpha & Beta and using them.
- x9: Used to store cs_c. x9 is multiplied by 8 into x10 and not used
  any longer. Loading cs_c directly into x10 and scaling it by 8 in
  place frees x9.
- x11, x12: Not used at all. Simply remove from clobber list.
- x13: Like x9, loaded and scaled by 8 into x14, except that x13 is
  also used in a conditional branch, so "cmp x13, #1" needs to be
  changed to "cmp x14, #8" to completely free x13.
- x3, x4: Used to store next_a & next_b. Untouched in the k-loops. Load
  these addresses into x0 and x1 after Alpha & Beta are both loaded,
  since by then neither the addresses of A/B nor those of Alpha/Beta
  are needed.
---
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 44 ++++++++++-----------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index 251931f7c..279b61b79 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -1135,20 +1135,14 @@ __asm__ volatile
 " ldr x1,%[baddr]                            \n\t" // Load address of B
 " ldr x2,%[caddr]                            \n\t" // Load address of C
 "                                            \n\t"
-" ldr x3,%[a_next]                           \n\t" // Move pointer
-" ldr x4,%[b_next]                           \n\t" // Move pointer
-"                                            \n\t"
 " ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
 " ldr x6,%[k_left]                           \n\t" // Init guard (k_iter)
 "                                            \n\t" 
-" ldr x7,%[alpha]                            \n\t" // Alpha address      
-" ldr x8,%[beta]                             \n\t" // Beta address      
-"                                            \n\t" 
-" ldr x9,%[cs_c]                             \n\t" // Load cs_c
-" lsl x10,x9,#3                              \n\t" // cs_c * sizeof(double)
+" ldr x10,%[cs_c]                            \n\t" // Load cs_c
+" lsl x10,x10,#3                             \n\t" // cs_c * sizeof(double)
 "                                            \n\t"
-" ldr x13,%[rs_c]                            \n\t" // Load rs_c.
-" lsl x14,x13,#3                             \n\t" // rs_c * sizeof(double). 
+" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
+" lsl x14,x14,#3                             \n\t" // rs_c * sizeof(double). 
 "                                            \n\t"
 " add x20,x2,x10                             \n\t" //Load address Column 1 of C
 " add x21,x20,x10                            \n\t" //Load address Column 2 of C
@@ -1610,10 +1604,16 @@ BNE(DLOOPKLEFT)                                    // if i!=0.
 "                                            \n\t"
 LABEL(DPOSTACCUM)
 "                                            \n\t"
-" ld1r {v6.2d},[x7]                          \n\t" // Load alpha.
-" ld1r {v7.2d},[x8]                          \n\t" // Load beta
+" ldr x0,%[alpha]                            \n\t" // Alpha address      
+" ldr x1,%[beta]                             \n\t" // Beta address      
+"                                            \n\t" 
+" ld1r {v6.2d},[x0]                          \n\t" // Load alpha.
+" ld1r {v7.2d},[x1]                          \n\t" // Load beta
 "                                            \n\t"
-" cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
+" ldr x0,%[a_next]                           \n\t" // Next A address for later use.
+" ldr x1,%[b_next]                           \n\t" // Next B address for later use.
+"                                            \n\t"
+" cmp x14,#8                                 \n\t" // If rs_c != 1 (column-major)
 BNE(DGENSTORED)
 "                                            \n\t"
 LABEL(DCOLSTORED)                                  // C is column-major.
@@ -1771,8 +1771,8 @@ BEQ(DBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0
 "                                            \n\t"
 LABEL(DBETAZEROCOLSTOREDS4)
 "                                            \n\t"
-" prfm pldl2keep,[x3]                        \n\t"
-" prfm pldl2keep,[x4]                        \n\t"
+" prfm pldl2keep,[x0]                        \n\t"
+" prfm pldl2keep,[x1]                        \n\t"
 "                                            \n\t"
 " fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // Scale by alpha
 " fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // Scale by alpha
@@ -2016,8 +2016,8 @@ BEQ(DBETAZEROGENSTOREDS4)                          // Taking care of the beta==0
 "                                            \n\t"
 LABEL(DBETAZEROGENSTOREDS4)
 "                                            \n\t"
-" prfm pldl2keep,[x3]                        \n\t"
-" prfm pldl2keep,[x4]                        \n\t"
+" prfm pldl2keep,[x0]                        \n\t"
+" prfm pldl2keep,[x1]                        \n\t"
 "                                            \n\t"
 " fmla v8.2d, v26.2d,v6.d[0]                 \n\t" // Scale by alpha
 " fmla v9.2d, v27.2d,v6.d[0]                 \n\t" // Scale by alpha
@@ -2060,12 +2060,10 @@ LABEL(DEND)                                        // Done!
  [a_next] "m" (a_next), // 8
  [b_next] "m" (b_next)  // 9
 :// Register clobber list
- "x0","x1","x2","x3",
- "x4","x5","x6",
- "x7","x8","x9",
- "x10","x11","x12","x13","x14","x16","x17",
- "x20","x21","x22","x23","x24","x25","x26",
- "x27",       
+ "x0","x1","x2",
+ "x5","x6","x10",
+ "x14","x16","x17",
+ "x20","x21","x22","x23","x24","x25","x26","x27",
  "v0","v1","v2",
  "v3","v4","v5",
  "v6","v7","v8",

From 9f4a4a3cfb2244e4024445e127dafd2a11f39fc5 Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Sat, 29 May 2021 17:21:28 +0900
Subject: [PATCH 3/4] Armv8A Rename Regs for Clang Compile: FP32 Part

Roughly the same as 916e1fa, additionally removing x15 from the clobber list.
- x15: Not used at all.

Compiling with Clang shows a warning about x18 being reserved, but
compilation itself succeeds and all tests pass.
---
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 41 ++++++++++-----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index 279b61b79..be5e20ae7 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -82,20 +82,14 @@ __asm__ volatile
 " ldr x1,%[baddr]                            \n\t" // Load address of B.
 " ldr x2,%[caddr]                            \n\t" // Load address of C.
 "                                            \n\t"
-" ldr x3,%[a_next]                           \n\t" // Pointer to next block of A.
-" ldr x4,%[b_next]                           \n\t" // Pointer to next pointer of B.
-"                                            \n\t"
 " ldr x5,%[k_iter]                           \n\t" // Number of unrolled iterations (k_iter).
 " ldr x6,%[k_left]                           \n\t" // Number of remaining iterations (k_left).
 "                                            \n\t" 
-" ldr x7,%[alpha]                            \n\t" // Alpha address.      
-" ldr x8,%[beta]                             \n\t" // Beta address.     
+" ldr x10,%[cs_c]                            \n\t" // Load cs_c.
+" lsl x10,x10,#2                             \n\t" // cs_c * sizeof(float) -- AUX.
 "                                            \n\t" 
-" ldr x9,%[cs_c]                             \n\t" // Load cs_c.
-" lsl x10,x9,#2                              \n\t" // cs_c * sizeof(float) -- AUX.
-"                                            \n\t" 
-" ldr x13,%[rs_c]                            \n\t" // Load rs_c.
-" lsl x14,x13,#2                             \n\t" // rs_c * sizeof(float).
+" ldr x14,%[rs_c]                            \n\t" // Load rs_c.
+" lsl x14,x14,#2                             \n\t" // rs_c * sizeof(float).
 "                                            \n\t"
 " add x16,x2,x10                             \n\t" //Load address Column 1 of C
 " add x17,x16,x10                            \n\t" //Load address Column 2 of C
@@ -502,10 +496,16 @@ BNE(SLOOPKLEFT)                                    // if i!=0.
 "                                            \n\t"
 LABEL(SPOSTACCUM)
 "                                            \n\t"
-" ld1r {v6.4s},[x7]                          \n\t" // Load alpha.
-" ld1r {v7.4s},[x8]                          \n\t" // Load beta
+" ldr x0,%[alpha]                            \n\t" // Alpha address.
+" ldr x1,%[beta]                             \n\t" // Beta address.
 "                                            \n\t"
-" cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
+" ld1r {v6.4s},[x0]                          \n\t" // Load alpha.
+" ld1r {v7.4s},[x1]                          \n\t" // Load beta
+"                                            \n\t"
+" ldr x0,%[a_next]                           \n\t" // Pointer to next block of A.
+" ldr x1,%[b_next]                           \n\t" // Pointer to next pointer of B.
+"                                            \n\t"
+" cmp x14,#4                                 \n\t" // If rs_c != 1 (column-major)
 BNE(SGENSTORED)
 "                                            \n\t"
 LABEL(SCOLSTORED)                                  // C is column-major.
@@ -656,8 +656,8 @@ BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0
 "                                            \n\t"
 LABEL(SBETAZEROCOLSTOREDS4)
 "                                            \n\t"
-" prfm pldl2keep,[x3]                        \n\t"
-" prfm pldl2keep,[x4]                        \n\t"
+" prfm pldl2keep,[x0]                        \n\t"
+" prfm pldl2keep,[x1]                        \n\t"
 "                                            \n\t"
 " fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
@@ -1008,8 +1008,8 @@ BEQ(SBETAZEROGENSTOREDS4)                          // Taking care of the beta==0
 "                                            \n\t"
 LABEL(SBETAZEROGENSTOREDS4)
 "                                            \n\t"
-" prfm pldl2keep,[x3]                        \n\t"
-" prfm pldl2keep,[x4]                        \n\t"
+" prfm pldl2keep,[x0]                        \n\t"
+" prfm pldl2keep,[x1]                        \n\t"
 "                                            \n\t"
 " fmla v8.4s, v26.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v9.4s, v27.4s,v6.s[0]                 \n\t" // Scale by alpha
@@ -1067,10 +1067,9 @@ LABEL(SEND)                                        // Done!
  [a_next] "m" (a_next), // 9
  [b_next] "m" (b_next) // 10
 :// Register clobber list
- "x0", "x1", "x2","x3","x4",
- "x5", "x6", "x7", "x8",
- "x9", "x10","x11","x12",
- "x13","x14","x15",
+ "x0", "x1", "x2",
+ "x5", "x6", "x10",
+ "x14",
  "x16","x17","x18","x19",       
  "x20","x21","x22","x23",
  "x24","x25","x26","x27",

From 5fc93e280614b4a21a9cff36cf873b4b9407285b Mon Sep 17 00:00:00 2001
From: RuQing Xu <r-xu@g.ecc.u-tokyo.ac.jp>
Date: Sat, 29 May 2021 18:44:47 +0900
Subject: [PATCH 4/4] Armv8A Rename Regs for Safe Darwin Compile

Avoid x18 use in FP32 kernel:
- C address lines x[18-26] renamed to x[19-27] (reg index +1)
- Original role of x27 fulfilled by x5, which is free after the k-loop part.

FP64 does not require changing since x18 is not used there.
---
 kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 533 ++++++++++----------
 1 file changed, 266 insertions(+), 267 deletions(-)

diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
index be5e20ae7..dfdda863b 100644
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -93,20 +93,19 @@ __asm__ volatile
 "                                            \n\t"
 " add x16,x2,x10                             \n\t" //Load address Column 1 of C
 " add x17,x16,x10                            \n\t" //Load address Column 2 of C
-" add x18,x17,x10                            \n\t" //Load address Column 3 of C
-" add x19,x18,x10                            \n\t" //Load address Column 4 of C
-" add x20,x19,x10                            \n\t" //Load address Column 5 of C
-" add x21,x20,x10                            \n\t" //Load address Column 6 of C
-" add x22,x21,x10                            \n\t" //Load address Column 7 of C
-" add x23,x22,x10                            \n\t" //Load address Column 8 of C
-" add x24,x23,x10                            \n\t" //Load address Column 9 of C
-" add x25,x24,x10                            \n\t" //Load address Column 10 of C
-" add x26,x25,x10                            \n\t" //Load address Column 11 of C
+" add x19,x17,x10                            \n\t" //Load address Column 3 of C
+" add x20,x19,x10                            \n\t" //Load address Column 4 of C
+" add x21,x20,x10                            \n\t" //Load address Column 5 of C
+" add x22,x21,x10                            \n\t" //Load address Column 6 of C
+" add x23,x22,x10                            \n\t" //Load address Column 7 of C
+" add x24,x23,x10                            \n\t" //Load address Column 8 of C
+" add x25,x24,x10                            \n\t" //Load address Column 9 of C
+" add x26,x25,x10                            \n\t" //Load address Column 10 of C
+" add x27,x26,x10                            \n\t" //Load address Column 11 of C
 "                                            \n\t"
 " prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
 " prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x18]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x19]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
@@ -115,6 +114,7 @@ __asm__ volatile
 " prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x27]                       \n\t" // Prefetch c.
 "                                            \n\t"
 " dup  v8.4s, wzr                            \n\t" // Vector for accummulating column 0
 " prfm    PLDL1KEEP, [x1, #192]              \n\t" 
@@ -560,12 +560,12 @@ LABEL(SBETAZEROCOLSTOREDS1)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldr q8, [x18]                              \n\t" //Load column 3 of C
-" ldr q9, [x18, #16]                         \n\t"
-" ldr q10, [x19]                             \n\t" //Load column 4 of C
-" ldr q11, [x19, #16]                        \n\t"
-" ldr q12, [x20]                             \n\t" //Load column 5 of C
-" ldr q13, [x20, #16]                        \n\t"
+" ldr q8, [x19]                              \n\t" //Load column 3 of C
+" ldr q9, [x19, #16]                         \n\t"
+" ldr q10, [x20]                             \n\t" //Load column 4 of C
+" ldr q11, [x20, #16]                        \n\t"
+" ldr q12, [x21]                             \n\t" //Load column 5 of C
+" ldr q13, [x21, #16]                        \n\t"
 "                                            \n\t"
 " fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
 " fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -583,12 +583,12 @@ LABEL(SBETAZEROCOLSTOREDS2)
 " fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
 "                                            \n\t"
-" str q8, [x18]                              \n\t" //Store column 3 of C
-" str q9, [x18, #16]                         \n\t"
-" str q10, [x19]                             \n\t" //Store column 4 of C
-" str q11, [x19, #16]                        \n\t"
-" str q12, [x20]                             \n\t" //Store column 5 of C
-" str q13, [x20, #16]                        \n\t"
+" str q8, [x19]                              \n\t" //Store column 3 of C
+" str q9, [x19, #16]                         \n\t"
+" str q10, [x20]                             \n\t" //Store column 4 of C
+" str q11, [x20, #16]                        \n\t"
+" str q12, [x21]                             \n\t" //Store column 5 of C
+" str q13, [x21, #16]                        \n\t"
 "                                            \n\t"
 " dup  v0.4s, wzr                            \n\t"
 " dup  v1.4s, wzr                            \n\t"
@@ -600,12 +600,12 @@ LABEL(SBETAZEROCOLSTOREDS2)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldr q0, [x21]                              \n\t" //Load column 6 of C
-" ldr q1, [x21, #16]                         \n\t"
-" ldr q2, [x22]                              \n\t" //Load column 7 of C
-" ldr q3, [x22, #16]                         \n\t"
-" ldr q4, [x23]                              \n\t" //Load column 8 of C
-" ldr q5, [x23, #16]                         \n\t"
+" ldr q0, [x22]                              \n\t" //Load column 6 of C
+" ldr q1, [x22, #16]                         \n\t"
+" ldr q2, [x23]                              \n\t" //Load column 7 of C
+" ldr q3, [x23, #16]                         \n\t"
+" ldr q4, [x24]                              \n\t" //Load column 8 of C
+" ldr q5, [x24, #16]                         \n\t"
 "                                            \n\t"
 " fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -623,12 +623,12 @@ LABEL(SBETAZEROCOLSTOREDS3)
 " fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" str q0, [x21]                              \n\t" //Store column 6 of C
-" str q1, [x21, #16]                         \n\t"
-" str q2, [x22]                              \n\t" //Store column 7 of C
-" str q3, [x22, #16]                         \n\t"
-" str q4, [x23]                              \n\t" //Store column 8 of C
-" str q5, [x23, #16]                         \n\t"
+" str q0, [x22]                              \n\t" //Store column 6 of C
+" str q1, [x22, #16]                         \n\t"
+" str q2, [x23]                              \n\t" //Store column 7 of C
+" str q3, [x23, #16]                         \n\t"
+" str q4, [x24]                              \n\t" //Store column 8 of C
+" str q5, [x24, #16]                         \n\t"
 "                                            \n\t"
 " dup  v8.4s, wzr                            \n\t"
 " dup  v9.4s, wzr                            \n\t"
@@ -640,12 +640,12 @@ LABEL(SBETAZEROCOLSTOREDS3)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldr q8, [x24]                              \n\t" //Load column 9 of C
-" ldr q9, [x24, #16]                         \n\t"
-" ldr q10, [x25]                             \n\t" //Load column 10 of C
-" ldr q11, [x25, #16]                        \n\t"
-" ldr q12, [x26]                             \n\t" //Load column 11 of C
-" ldr q13, [x26, #16]                        \n\t"
+" ldr q8, [x25]                              \n\t" //Load column 9 of C
+" ldr q9, [x25, #16]                         \n\t"
+" ldr q10, [x26]                             \n\t" //Load column 10 of C
+" ldr q11, [x26, #16]                        \n\t"
+" ldr q12, [x27]                             \n\t" //Load column 11 of C
+" ldr q13, [x27, #16]                        \n\t"
 "                                            \n\t"
 " fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
 " fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -666,12 +666,12 @@ LABEL(SBETAZEROCOLSTOREDS4)
 " fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
 "                                            \n\t"
-" str q8, [x24]                              \n\t" //Store column 9 of C
-" str q9, [x24, #16]                         \n\t"
-" str q10, [x25]                             \n\t" //Store column 10 of C
-" str q11, [x25, #16]                        \n\t"
-" str q12, [x26]                             \n\t" //Store column 11 of C
-" str q13, [x26, #16]                        \n\t"
+" str q8, [x25]                              \n\t" //Store column 9 of C
+" str q9, [x25, #16]                         \n\t"
+" str q10, [x26]                             \n\t" //Store column 10 of C
+" str q11, [x26, #16]                        \n\t"
+" str q12, [x27]                             \n\t" //Store column 11 of C
+" str q13, [x27, #16]                        \n\t"
 "                                            \n\t"
 "                                            \n\t"
 BRANCH(SEND)                                       // Done.
@@ -690,38 +690,38 @@ LABEL(SGENSTORED)                                  // C is general-stride stored
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" mov x27, x2                                \n\t"
+" mov x5, x2                                 \n\t"
 "                                            \n\t"
-" ld1 {v0.s}[0],[x27],x14                    \n\t" // Load c00  into quad and increment by rs_c.
-" ld1 {v0.s}[1],[x27],x14                    \n\t" // Load c01  into quad and increment by rs_c.
-" ld1 {v0.s}[2],[x27],x14                    \n\t" // Load c02  into quad and increment by rs_c.
-" ld1 {v0.s}[3],[x27],x14                    \n\t" // Load c03  into quad and increment by rs_c.
-" ld1 {v1.s}[0],[x27],x14                    \n\t" // Load c04  into quad and increment by rs_c.
-" ld1 {v1.s}[1],[x27],x14                    \n\t" // Load c05  into quad and increment by rs_c.
-" ld1 {v1.s}[2],[x27],x14                    \n\t" // Load c06  into quad and increment by rs_c.
-" ld1 {v1.s}[3],[x27],x14                    \n\t" // Load c07  into quad and increment by rs_c.
+" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c00  into quad and increment by rs_c.
+" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c01  into quad and increment by rs_c.
+" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c02  into quad and increment by rs_c.
+" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c03  into quad and increment by rs_c.
+" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c04  into quad and increment by rs_c.
+" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c05  into quad and increment by rs_c.
+" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c06  into quad and increment by rs_c.
+" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c07  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x16                               \n\t"
+" mov x5, x16                                \n\t"
 "                                            \n\t"
-" ld1 {v2.s}[0],[x27],x14                    \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v2.s}[1],[x27],x14                    \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v2.s}[2],[x27],x14                    \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v2.s}[3],[x27],x14                    \n\t" // Load c13  into quad and increment by rs_c.
-" ld1 {v3.s}[0],[x27],x14                    \n\t" // Load c14  into quad and increment by rs_c.
-" ld1 {v3.s}[1],[x27],x14                    \n\t" // Load c15  into quad and increment by rs_c.
-" ld1 {v3.s}[2],[x27],x14                    \n\t" // Load c16  into quad and increment by rs_c.
-" ld1 {v3.s}[3],[x27],x14                    \n\t" // Load c17  into quad and increment by rs_c.
+" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c10  into quad and increment by rs_c.
+" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c11  into quad and increment by rs_c.
+" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c12  into quad and increment by rs_c.
+" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c13  into quad and increment by rs_c.
+" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c14  into quad and increment by rs_c.
+" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c15  into quad and increment by rs_c.
+" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c16  into quad and increment by rs_c.
+" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c17  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x17                               \n\t"
+" mov x5, x17                                \n\t"
 "                                            \n\t"
-" ld1 {v4.s}[0],[x27],x14                    \n\t" // Load c20  into quad and increment by rs_c.
-" ld1 {v4.s}[1],[x27],x14                    \n\t" // Load c21  into quad and increment by rs_c.
-" ld1 {v4.s}[2],[x27],x14                    \n\t" // Load c22  into quad and increment by rs_c.
-" ld1 {v4.s}[3],[x27],x14                    \n\t" // Load c23  into quad and increment by rs_c.
-" ld1 {v5.s}[0],[x27],x14                    \n\t" // Load c24  into quad and increment by rs_c.
-" ld1 {v5.s}[1],[x27],x14                    \n\t" // Load c25  into quad and increment by rs_c.
-" ld1 {v5.s}[2],[x27],x14                    \n\t" // Load c26  into quad and increment by rs_c.
-" ld1 {v5.s}[3],[x27],x14                    \n\t" // Load c27  into quad and increment by rs_c.
+" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c20  into quad and increment by rs_c.
+" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c21  into quad and increment by rs_c.
+" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c22  into quad and increment by rs_c.
+" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c23  into quad and increment by rs_c.
+" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c24  into quad and increment by rs_c.
+" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c25  into quad and increment by rs_c.
+" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c26  into quad and increment by rs_c.
+" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c27  into quad and increment by rs_c.
 "                                            \n\t"
 " fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -739,38 +739,38 @@ LABEL(SBETAZEROGENSTOREDS1)
 " fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" mov x27, x2                                \n\t"
+" mov x5, x2                                 \n\t"
 "                                            \n\t"
-" st1 {v0.s}[0],[x27],x14                    \n\t" // Store c00  into quad and increment by rs_c.
-" st1 {v0.s}[1],[x27],x14                    \n\t" // Store c01  into quad and increment by rs_c.
-" st1 {v0.s}[2],[x27],x14                    \n\t" // Store c02  into quad and increment by rs_c.
-" st1 {v0.s}[3],[x27],x14                    \n\t" // Store c03  into quad and increment by rs_c.
-" st1 {v1.s}[0],[x27],x14                    \n\t" // Store c04  into quad and increment by rs_c.
-" st1 {v1.s}[1],[x27],x14                    \n\t" // Store c05  into quad and increment by rs_c.
-" st1 {v1.s}[2],[x27],x14                    \n\t" // Store c06  into quad and increment by rs_c.
-" st1 {v1.s}[3],[x27],x14                    \n\t" // Store c07  into quad and increment by rs_c.
+" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c00  into quad and increment by rs_c.
+" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c01  into quad and increment by rs_c.
+" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c02  into quad and increment by rs_c.
+" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c03  into quad and increment by rs_c.
+" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c04  into quad and increment by rs_c.
+" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c05  into quad and increment by rs_c.
+" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c06  into quad and increment by rs_c.
+" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c07  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x16                               \n\t"
+" mov x5, x16                                \n\t"
 "                                            \n\t"
-" st1 {v2.s}[0],[x27],x14                    \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v2.s}[1],[x27],x14                    \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v2.s}[2],[x27],x14                    \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v2.s}[3],[x27],x14                    \n\t" // Store c13  into quad and increment by rs_c.
-" st1 {v3.s}[0],[x27],x14                    \n\t" // Store c14  into quad and increment by rs_c.
-" st1 {v3.s}[1],[x27],x14                    \n\t" // Store c15  into quad and increment by rs_c.
-" st1 {v3.s}[2],[x27],x14                    \n\t" // Store c16  into quad and increment by rs_c.
-" st1 {v3.s}[3],[x27],x14                    \n\t" // Store c17  into quad and increment by rs_c.
+" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c10  into quad and increment by rs_c.
+" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c11  into quad and increment by rs_c.
+" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c12  into quad and increment by rs_c.
+" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c13  into quad and increment by rs_c.
+" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c14  into quad and increment by rs_c.
+" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c15  into quad and increment by rs_c.
+" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c16  into quad and increment by rs_c.
+" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c17  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x17                               \n\t"
+" mov x5, x17                                \n\t"
 "                                            \n\t"
-" st1 {v4.s}[0],[x27],x14                    \n\t" // Store c20  into quad and increment by rs_c.
-" st1 {v4.s}[1],[x27],x14                    \n\t" // Store c21  into quad and increment by rs_c.
-" st1 {v4.s}[2],[x27],x14                    \n\t" // Store c22  into quad and increment by rs_c.
-" st1 {v4.s}[3],[x27],x14                    \n\t" // Store c23  into quad and increment by rs_c.
-" st1 {v5.s}[0],[x27],x14                    \n\t" // Store c24  into quad and increment by rs_c.
-" st1 {v5.s}[1],[x27],x14                    \n\t" // Store c25  into quad and increment by rs_c.
-" st1 {v5.s}[2],[x27],x14                    \n\t" // Store c26  into quad and increment by rs_c.
-" st1 {v5.s}[3],[x27],x14                    \n\t" // Store c27  into quad and increment by rs_c.
+" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c20  into quad and increment by rs_c.
+" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c21  into quad and increment by rs_c.
+" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c22  into quad and increment by rs_c.
+" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c23  into quad and increment by rs_c.
+" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c24  into quad and increment by rs_c.
+" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c25  into quad and increment by rs_c.
+" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c26  into quad and increment by rs_c.
+" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c27  into quad and increment by rs_c.
 "                                            \n\t"
 " dup  v8.4s, wzr                            \n\t"
 " dup  v9.4s, wzr                            \n\t"
@@ -782,38 +782,38 @@ LABEL(SBETAZEROGENSTOREDS1)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" mov x27, x18                               \n\t"
+" mov x5, x19                                \n\t"
 "                                            \n\t"
-" ld1 {v8.s}[0],[x27],x14                    \n\t" // Load c30  into quad and increment by rs_c.
-" ld1 {v8.s}[1],[x27],x14                    \n\t" // Load c31  into quad and increment by rs_c.
-" ld1 {v8.s}[2],[x27],x14                    \n\t" // Load c32  into quad and increment by rs_c.
-" ld1 {v8.s}[3],[x27],x14                    \n\t" // Load c33  into quad and increment by rs_c.
-" ld1 {v9.s}[0],[x27],x14                    \n\t" // Load c34  into quad and increment by rs_c.
-" ld1 {v9.s}[1],[x27],x14                    \n\t" // Load c35  into quad and increment by rs_c.
-" ld1 {v9.s}[2],[x27],x14                    \n\t" // Load c36  into quad and increment by rs_c.
-" ld1 {v9.s}[3],[x27],x14                    \n\t" // Load c37  into quad and increment by rs_c.
+" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c30  into quad and increment by rs_c.
+" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c31  into quad and increment by rs_c.
+" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c32  into quad and increment by rs_c.
+" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c33  into quad and increment by rs_c.
+" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c34  into quad and increment by rs_c.
+" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c35  into quad and increment by rs_c.
+" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c36  into quad and increment by rs_c.
+" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c37  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x19                               \n\t"
+" mov x5, x20                                \n\t"
 "                                            \n\t"
-" ld1 {v10.s}[0],[x27],x14                   \n\t" // Load c40  into quad and increment by rs_c.
-" ld1 {v10.s}[1],[x27],x14                   \n\t" // Load c41  into quad and increment by rs_c.
-" ld1 {v10.s}[2],[x27],x14                   \n\t" // Load c42  into quad and increment by rs_c.
-" ld1 {v10.s}[3],[x27],x14                   \n\t" // Load c43  into quad and increment by rs_c.
-" ld1 {v11.s}[0],[x27],x14                   \n\t" // Load c44  into quad and increment by rs_c.
-" ld1 {v11.s}[1],[x27],x14                   \n\t" // Load c45  into quad and increment by rs_c.
-" ld1 {v11.s}[2],[x27],x14                   \n\t" // Load c46  into quad and increment by rs_c.
-" ld1 {v11.s}[3],[x27],x14                   \n\t" // Load c47  into quad and increment by rs_c.
+" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c40  into quad and increment by rs_c.
+" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c41  into quad and increment by rs_c.
+" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c42  into quad and increment by rs_c.
+" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c43  into quad and increment by rs_c.
+" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c44  into quad and increment by rs_c.
+" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c45  into quad and increment by rs_c.
+" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c46  into quad and increment by rs_c.
+" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c47  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x20                               \n\t"
+" mov x5, x21                                \n\t"
 "                                            \n\t"
-" ld1 {v12.s}[0],[x27],x14                   \n\t" // Load c50  into quad and increment by rs_c.
-" ld1 {v12.s}[1],[x27],x14                   \n\t" // Load c51  into quad and increment by rs_c.
-" ld1 {v12.s}[2],[x27],x14                   \n\t" // Load c52  into quad and increment by rs_c.
-" ld1 {v12.s}[3],[x27],x14                   \n\t" // Load c53  into quad and increment by rs_c.
-" ld1 {v13.s}[0],[x27],x14                   \n\t" // Load c54  into quad and increment by rs_c.
-" ld1 {v13.s}[1],[x27],x14                   \n\t" // Load c55  into quad and increment by rs_c.
-" ld1 {v13.s}[2],[x27],x14                   \n\t" // Load c56  into quad and increment by rs_c.
-" ld1 {v13.s}[3],[x27],x14                   \n\t" // Load c57  into quad and increment by rs_c.
+" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c50  into quad and increment by rs_c.
+" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c51  into quad and increment by rs_c.
+" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c52  into quad and increment by rs_c.
+" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c53  into quad and increment by rs_c.
+" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c54  into quad and increment by rs_c.
+" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c55  into quad and increment by rs_c.
+" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c56  into quad and increment by rs_c.
+" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c57  into quad and increment by rs_c.
 "                                            \n\t"
 " fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
 " fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -831,38 +831,38 @@ LABEL(SBETAZEROGENSTOREDS2)
 " fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
 "                                            \n\t"
-" mov x27, x18                               \n\t"
+" mov x5, x19                                \n\t"
 "                                            \n\t"
-" st1 {v8.s}[0],[x27],x14                    \n\t" // Store c30  into quad and increment by rs_c.
-" st1 {v8.s}[1],[x27],x14                    \n\t" // Store c31  into quad and increment by rs_c.
-" st1 {v8.s}[2],[x27],x14                    \n\t" // Store c32  into quad and increment by rs_c.
-" st1 {v8.s}[3],[x27],x14                    \n\t" // Store c33  into quad and increment by rs_c.
-" st1 {v9.s}[0],[x27],x14                    \n\t" // Store c34  into quad and increment by rs_c.
-" st1 {v9.s}[1],[x27],x14                    \n\t" // Store c35  into quad and increment by rs_c.
-" st1 {v9.s}[2],[x27],x14                    \n\t" // Store c36  into quad and increment by rs_c.
-" st1 {v9.s}[3],[x27],x14                    \n\t" // Store c37  into quad and increment by rs_c.
+" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c30  into quad and increment by rs_c.
+" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c31  into quad and increment by rs_c.
+" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c32  into quad and increment by rs_c.
+" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c33  into quad and increment by rs_c.
+" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c34  into quad and increment by rs_c.
+" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c35  into quad and increment by rs_c.
+" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c36  into quad and increment by rs_c.
+" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c37  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x19                               \n\t"
+" mov x5, x20                                \n\t"
 "                                            \n\t"
-" st1 {v10.s}[0],[x27],x14                   \n\t" // Store c40  into quad and increment by rs_c.
-" st1 {v10.s}[1],[x27],x14                   \n\t" // Store c41  into quad and increment by rs_c.
-" st1 {v10.s}[2],[x27],x14                   \n\t" // Store c42  into quad and increment by rs_c.
-" st1 {v10.s}[3],[x27],x14                   \n\t" // Store c43  into quad and increment by rs_c.
-" st1 {v11.s}[0],[x27],x14                   \n\t" // Store c44  into quad and increment by rs_c.
-" st1 {v11.s}[1],[x27],x14                   \n\t" // Store c45  into quad and increment by rs_c.
-" st1 {v11.s}[2],[x27],x14                   \n\t" // Store c46  into quad and increment by rs_c.
-" st1 {v11.s}[3],[x27],x14                   \n\t" // Store c47  into quad and increment by rs_c.
+" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c40  into quad and increment by rs_c.
+" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c41  into quad and increment by rs_c.
+" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c42  into quad and increment by rs_c.
+" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c43  into quad and increment by rs_c.
+" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c44  into quad and increment by rs_c.
+" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c45  into quad and increment by rs_c.
+" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c46  into quad and increment by rs_c.
+" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c47  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x20                               \n\t"
+" mov x5, x21                                \n\t"
 "                                            \n\t"
-" st1 {v12.s}[0],[x27],x14                   \n\t" // Store c50  into quad and increment by rs_c.
-" st1 {v12.s}[1],[x27],x14                   \n\t" // Store c51  into quad and increment by rs_c.
-" st1 {v12.s}[2],[x27],x14                   \n\t" // Store c52  into quad and increment by rs_c.
-" st1 {v12.s}[3],[x27],x14                   \n\t" // Store c53  into quad and increment by rs_c.
-" st1 {v13.s}[0],[x27],x14                   \n\t" // Store c54  into quad and increment by rs_c.
-" st1 {v13.s}[1],[x27],x14                   \n\t" // Store c55  into quad and increment by rs_c.
-" st1 {v13.s}[2],[x27],x14                   \n\t" // Store c56  into quad and increment by rs_c.
-" st1 {v13.s}[3],[x27],x14                   \n\t" // Store c57  into quad and increment by rs_c.
+" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c50  into quad and increment by rs_c.
+" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c51  into quad and increment by rs_c.
+" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c52  into quad and increment by rs_c.
+" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c53  into quad and increment by rs_c.
+" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c54  into quad and increment by rs_c.
+" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c55  into quad and increment by rs_c.
+" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c56  into quad and increment by rs_c.
+" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c57  into quad and increment by rs_c.
 "                                            \n\t"
 " dup  v0.4s, wzr                            \n\t"
 " dup  v1.4s, wzr                            \n\t"
@@ -874,38 +874,38 @@ LABEL(SBETAZEROGENSTOREDS2)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" mov x27, x21                               \n\t"
+" mov x5, x22                                \n\t"
 "                                            \n\t"
-" ld1 {v0.s}[0],[x27],x14                    \n\t" // Load c60  into quad and increment by rs_c.
-" ld1 {v0.s}[1],[x27],x14                    \n\t" // Load c61  into quad and increment by rs_c.
-" ld1 {v0.s}[2],[x27],x14                    \n\t" // Load c62  into quad and increment by rs_c.
-" ld1 {v0.s}[3],[x27],x14                    \n\t" // Load c63  into quad and increment by rs_c.
-" ld1 {v1.s}[0],[x27],x14                    \n\t" // Load c64  into quad and increment by rs_c.
-" ld1 {v1.s}[1],[x27],x14                    \n\t" // Load c65  into quad and increment by rs_c.
-" ld1 {v1.s}[2],[x27],x14                    \n\t" // Load c66  into quad and increment by rs_c.
-" ld1 {v1.s}[3],[x27],x14                    \n\t" // Load c67  into quad and increment by rs_c.
+" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c60  into quad and increment by rs_c.
+" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c61  into quad and increment by rs_c.
+" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c62  into quad and increment by rs_c.
+" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c63  into quad and increment by rs_c.
+" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c64  into quad and increment by rs_c.
+" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c65  into quad and increment by rs_c.
+" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c66  into quad and increment by rs_c.
+" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c67  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x22                               \n\t"
+" mov x5, x23                                \n\t"
 "                                            \n\t"
-" ld1 {v2.s}[0],[x27],x14                    \n\t" // Load c70  into quad and increment by rs_c.
-" ld1 {v2.s}[1],[x27],x14                    \n\t" // Load c71  into quad and increment by rs_c.
-" ld1 {v2.s}[2],[x27],x14                    \n\t" // Load c72  into quad and increment by rs_c.
-" ld1 {v2.s}[3],[x27],x14                    \n\t" // Load c73  into quad and increment by rs_c.
-" ld1 {v3.s}[0],[x27],x14                    \n\t" // Load c74  into quad and increment by rs_c.
-" ld1 {v3.s}[1],[x27],x14                    \n\t" // Load c75  into quad and increment by rs_c.
-" ld1 {v3.s}[2],[x27],x14                    \n\t" // Load c76  into quad and increment by rs_c.
-" ld1 {v3.s}[3],[x27],x14                    \n\t" // Load c77  into quad and increment by rs_c.
+" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c70  into quad and increment by rs_c.
+" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c71  into quad and increment by rs_c.
+" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c72  into quad and increment by rs_c.
+" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c73  into quad and increment by rs_c.
+" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c74  into quad and increment by rs_c.
+" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c75  into quad and increment by rs_c.
+" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c76  into quad and increment by rs_c.
+" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c77  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x23                               \n\t"
+" mov x5, x24                                \n\t"
 "                                            \n\t"
-" ld1 {v4.s}[0],[x27],x14                    \n\t" // Load c80  into quad and increment by rs_c.
-" ld1 {v4.s}[1],[x27],x14                    \n\t" // Load c81  into quad and increment by rs_c.
-" ld1 {v4.s}[2],[x27],x14                    \n\t" // Load c82  into quad and increment by rs_c.
-" ld1 {v4.s}[3],[x27],x14                    \n\t" // Load c83  into quad and increment by rs_c.
-" ld1 {v5.s}[0],[x27],x14                    \n\t" // Load c84  into quad and increment by rs_c.
-" ld1 {v5.s}[1],[x27],x14                    \n\t" // Load c85  into quad and increment by rs_c.
-" ld1 {v5.s}[2],[x27],x14                    \n\t" // Load c86  into quad and increment by rs_c.
-" ld1 {v5.s}[3],[x27],x14                    \n\t" // Load c87  into quad and increment by rs_c.
+" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c80  into quad and increment by rs_c.
+" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c81  into quad and increment by rs_c.
+" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c82  into quad and increment by rs_c.
+" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c83  into quad and increment by rs_c.
+" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c84  into quad and increment by rs_c.
+" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c85  into quad and increment by rs_c.
+" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c86  into quad and increment by rs_c.
+" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c87  into quad and increment by rs_c.
 "                                            \n\t"
 " fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -923,38 +923,38 @@ LABEL(SBETAZEROGENSTOREDS3)
 " fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" mov x27, x21                               \n\t"
+" mov x5, x22                                \n\t"
 "                                            \n\t"
-" st1 {v0.s}[0],[x27],x14                    \n\t" // Store c60  into quad and increment by rs_c.
-" st1 {v0.s}[1],[x27],x14                    \n\t" // Store c61  into quad and increment by rs_c.
-" st1 {v0.s}[2],[x27],x14                    \n\t" // Store c62  into quad and increment by rs_c.
-" st1 {v0.s}[3],[x27],x14                    \n\t" // Store c63  into quad and increment by rs_c.
-" st1 {v1.s}[0],[x27],x14                    \n\t" // Store c64  into quad and increment by rs_c.
-" st1 {v1.s}[1],[x27],x14                    \n\t" // Store c65  into quad and increment by rs_c.
-" st1 {v1.s}[2],[x27],x14                    \n\t" // Store c66  into quad and increment by rs_c.
-" st1 {v1.s}[3],[x27],x14                    \n\t" // Store c67  into quad and increment by rs_c.
+" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c60  into quad and increment by rs_c.
+" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c61  into quad and increment by rs_c.
+" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c62  into quad and increment by rs_c.
+" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c63  into quad and increment by rs_c.
+" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c64  into quad and increment by rs_c.
+" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c65  into quad and increment by rs_c.
+" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c66  into quad and increment by rs_c.
+" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c67  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x22                               \n\t"
+" mov x5, x23                                \n\t"
 "                                            \n\t"
-" st1 {v2.s}[0],[x27],x14                    \n\t" // Store c70  into quad and increment by rs_c.
-" st1 {v2.s}[1],[x27],x14                    \n\t" // Store c71  into quad and increment by rs_c.
-" st1 {v2.s}[2],[x27],x14                    \n\t" // Store c72  into quad and increment by rs_c.
-" st1 {v2.s}[3],[x27],x14                    \n\t" // Store c73  into quad and increment by rs_c.
-" st1 {v3.s}[0],[x27],x14                    \n\t" // Store c74  into quad and increment by rs_c.
-" st1 {v3.s}[1],[x27],x14                    \n\t" // Store c75  into quad and increment by rs_c.
-" st1 {v3.s}[2],[x27],x14                    \n\t" // Store c76  into quad and increment by rs_c.
-" st1 {v3.s}[3],[x27],x14                    \n\t" // Store c77  into quad and increment by rs_c.
+" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c70  into quad and increment by rs_c.
+" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c71  into quad and increment by rs_c.
+" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c72  into quad and increment by rs_c.
+" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c73  into quad and increment by rs_c.
+" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c74  into quad and increment by rs_c.
+" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c75  into quad and increment by rs_c.
+" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c76  into quad and increment by rs_c.
+" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c77  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x23                               \n\t"
+" mov x5, x24                                \n\t"
 "                                            \n\t"
-" st1 {v4.s}[0],[x27],x14                    \n\t" // Store c80  into quad and increment by rs_c.
-" st1 {v4.s}[1],[x27],x14                    \n\t" // Store c81  into quad and increment by rs_c.
-" st1 {v4.s}[2],[x27],x14                    \n\t" // Store c82  into quad and increment by rs_c.
-" st1 {v4.s}[3],[x27],x14                    \n\t" // Store c83  into quad and increment by rs_c.
-" st1 {v5.s}[0],[x27],x14                    \n\t" // Store c84  into quad and increment by rs_c.
-" st1 {v5.s}[1],[x27],x14                    \n\t" // Store c85  into quad and increment by rs_c.
-" st1 {v5.s}[2],[x27],x14                    \n\t" // Store c86  into quad and increment by rs_c.
-" st1 {v5.s}[3],[x27],x14                    \n\t" // Store c87  into quad and increment by rs_c.
+" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c80  into quad and increment by rs_c.
+" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c81  into quad and increment by rs_c.
+" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c82  into quad and increment by rs_c.
+" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c83  into quad and increment by rs_c.
+" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c84  into quad and increment by rs_c.
+" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c85  into quad and increment by rs_c.
+" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c86  into quad and increment by rs_c.
+" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c87  into quad and increment by rs_c.
 "                                            \n\t"
 " dup  v8.4s, wzr                            \n\t"
 " dup  v9.4s, wzr                            \n\t"
@@ -966,38 +966,38 @@ LABEL(SBETAZEROGENSTOREDS3)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" mov x27, x24                               \n\t"
+" mov x5, x25                                \n\t"
 "                                            \n\t"
-" ld1 {v8.s}[0],[x27],x14                    \n\t" // Load c90  into quad and increment by rs_c.
-" ld1 {v8.s}[1],[x27],x14                    \n\t" // Load c91  into quad and increment by rs_c.
-" ld1 {v8.s}[2],[x27],x14                    \n\t" // Load c92  into quad and increment by rs_c.
-" ld1 {v8.s}[3],[x27],x14                    \n\t" // Load c93  into quad and increment by rs_c.
-" ld1 {v9.s}[0],[x27],x14                    \n\t" // Load c94  into quad and increment by rs_c.
-" ld1 {v9.s}[1],[x27],x14                    \n\t" // Load c95  into quad and increment by rs_c.
-" ld1 {v9.s}[2],[x27],x14                    \n\t" // Load c96  into quad and increment by rs_c.
-" ld1 {v9.s}[3],[x27],x14                    \n\t" // Load c97  into quad and increment by rs_c.
+" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c90  into quad and increment by rs_c.
+" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c91  into quad and increment by rs_c.
+" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c92  into quad and increment by rs_c.
+" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c93  into quad and increment by rs_c.
+" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c94  into quad and increment by rs_c.
+" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c95  into quad and increment by rs_c.
+" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c96  into quad and increment by rs_c.
+" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c97  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x25                               \n\t"
+" mov x5, x26                                \n\t"
 "                                            \n\t"
-" ld1 {v10.s}[0],[x27],x14                   \n\t" // Load c100  into quad and increment by rs_c.
-" ld1 {v10.s}[1],[x27],x14                   \n\t" // Load c101  into quad and increment by rs_c.
-" ld1 {v10.s}[2],[x27],x14                   \n\t" // Load c102  into quad and increment by rs_c.
-" ld1 {v10.s}[3],[x27],x14                   \n\t" // Load c103  into quad and increment by rs_c.
-" ld1 {v11.s}[0],[x27],x14                   \n\t" // Load c104  into quad and increment by rs_c.
-" ld1 {v11.s}[1],[x27],x14                   \n\t" // Load c105  into quad and increment by rs_c.
-" ld1 {v11.s}[2],[x27],x14                   \n\t" // Load c106  into quad and increment by rs_c.
-" ld1 {v11.s}[3],[x27],x14                   \n\t" // Load c107  into quad and increment by rs_c.
+" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c100  into quad and increment by rs_c.
+" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c101  into quad and increment by rs_c.
+" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c102  into quad and increment by rs_c.
+" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c103  into quad and increment by rs_c.
+" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c104  into quad and increment by rs_c.
+" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c105  into quad and increment by rs_c.
+" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c106  into quad and increment by rs_c.
+" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c107  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x26                               \n\t"
+" mov x5, x27                                \n\t"
 "                                            \n\t"
-" ld1 {v12.s}[0],[x27],x14                   \n\t" // Load c110  into quad and increment by rs_c.
-" ld1 {v12.s}[1],[x27],x14                   \n\t" // Load c111  into quad and increment by rs_c.
-" ld1 {v12.s}[2],[x27],x14                   \n\t" // Load c112  into quad and increment by rs_c.
-" ld1 {v12.s}[3],[x27],x14                   \n\t" // Load c113  into quad and increment by rs_c.
-" ld1 {v13.s}[0],[x27],x14                   \n\t" // Load c114  into quad and increment by rs_c.
-" ld1 {v13.s}[1],[x27],x14                   \n\t" // Load c115  into quad and increment by rs_c.
-" ld1 {v13.s}[2],[x27],x14                   \n\t" // Load c116  into quad and increment by rs_c.
-" ld1 {v13.s}[3],[x27],x14                   \n\t" // Load c117  into quad and increment by rs_c.
+" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c110  into quad and increment by rs_c.
+" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c111  into quad and increment by rs_c.
+" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c112  into quad and increment by rs_c.
+" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c113  into quad and increment by rs_c.
+" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c114  into quad and increment by rs_c.
+" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c115  into quad and increment by rs_c.
+" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c116  into quad and increment by rs_c.
+" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c117  into quad and increment by rs_c.
 "                                            \n\t"
 " fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
 " fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -1018,38 +1018,38 @@ LABEL(SBETAZEROGENSTOREDS4)
 " fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
 "                                            \n\t"
-" mov x27, x24                               \n\t"
+" mov x5, x25                                \n\t"
 "                                            \n\t"
-" st1 {v8.s}[0],[x27],x14                    \n\t" // Store c90  into quad and increment by rs_c.
-" st1 {v8.s}[1],[x27],x14                    \n\t" // Store c91  into quad and increment by rs_c.
-" st1 {v8.s}[2],[x27],x14                    \n\t" // Store c92  into quad and increment by rs_c.
-" st1 {v8.s}[3],[x27],x14                    \n\t" // Store c93  into quad and increment by rs_c.
-" st1 {v9.s}[0],[x27],x14                    \n\t" // Store c94  into quad and increment by rs_c.
-" st1 {v9.s}[1],[x27],x14                    \n\t" // Store c95  into quad and increment by rs_c.
-" st1 {v9.s}[2],[x27],x14                    \n\t" // Store c96  into quad and increment by rs_c.
-" st1 {v9.s}[3],[x27],x14                    \n\t" // Store c97  into quad and increment by rs_c.
+" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c90  into quad and increment by rs_c.
+" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c91  into quad and increment by rs_c.
+" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c92  into quad and increment by rs_c.
+" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c93  into quad and increment by rs_c.
+" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c94  into quad and increment by rs_c.
+" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c95  into quad and increment by rs_c.
+" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c96  into quad and increment by rs_c.
+" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c97  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x25                               \n\t"
+" mov x5, x26                                \n\t"
 "                                            \n\t"
-" st1 {v10.s}[0],[x27],x14                   \n\t" // Store c100  into quad and increment by rs_c.
-" st1 {v10.s}[1],[x27],x14                   \n\t" // Store c101  into quad and increment by rs_c.
-" st1 {v10.s}[2],[x27],x14                   \n\t" // Store c102  into quad and increment by rs_c.
-" st1 {v10.s}[3],[x27],x14                   \n\t" // Store c103  into quad and increment by rs_c.
-" st1 {v11.s}[0],[x27],x14                   \n\t" // Store c104  into quad and increment by rs_c.
-" st1 {v11.s}[1],[x27],x14                   \n\t" // Store c105  into quad and increment by rs_c.
-" st1 {v11.s}[2],[x27],x14                   \n\t" // Store c106  into quad and increment by rs_c.
-" st1 {v11.s}[3],[x27],x14                   \n\t" // Store c107  into quad and increment by rs_c.
+" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c100  into quad and increment by rs_c.
+" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c101  into quad and increment by rs_c.
+" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c102  into quad and increment by rs_c.
+" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c103  into quad and increment by rs_c.
+" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c104  into quad and increment by rs_c.
+" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c105  into quad and increment by rs_c.
+" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c106  into quad and increment by rs_c.
+" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c107  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x26                               \n\t"
+" mov x5, x27                                \n\t"
 "                                            \n\t"
-" st1 {v12.s}[0],[x27],x14                   \n\t" // Store c110  into quad and increment by rs_c.
-" st1 {v12.s}[1],[x27],x14                   \n\t" // Store c111  into quad and increment by rs_c.
-" st1 {v12.s}[2],[x27],x14                   \n\t" // Store c112  into quad and increment by rs_c.
-" st1 {v12.s}[3],[x27],x14                   \n\t" // Store c113  into quad and increment by rs_c.
-" st1 {v13.s}[0],[x27],x14                   \n\t" // Store c114  into quad and increment by rs_c.
-" st1 {v13.s}[1],[x27],x14                   \n\t" // Store c115  into quad and increment by rs_c.
-" st1 {v13.s}[2],[x27],x14                   \n\t" // Store c116  into quad and increment by rs_c.
-" st1 {v13.s}[3],[x27],x14                   \n\t" // Store c147  into quad and increment by rs_c.
+" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c110  into quad and increment by rs_c.
+" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c111  into quad and increment by rs_c.
+" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c112  into quad and increment by rs_c.
+" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c113  into quad and increment by rs_c.
+" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c114  into quad and increment by rs_c.
+" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c115  into quad and increment by rs_c.
+" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c116  into quad and increment by rs_c.
+" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c117  into quad and increment by rs_c.
 "                                            \n\t"
 LABEL(SEND)                                        // Done!
 "                                            \n\t"
@@ -1068,11 +1068,10 @@ LABEL(SEND)                                        // Done!
  [b_next] "m" (b_next) // 10
 :// Register clobber list
  "x0", "x1", "x2",
- "x5", "x6", "x10",
- "x14",
- "x16","x17","x18","x19",       
- "x20","x21","x22","x23",
- "x24","x25","x26","x27",
+ "x5", "x6", "x10","x14",
+ "x16","x17","x19","x20",
+ "x21","x22","x23","x24",
+ "x25","x26","x27",
  "v0", "v1", "v2", "v3",
  "v4", "v5", "v6", "v7",
  "v8", "v9", "v10","v11",