Armv8A Rename Regs for Safe Darwin Compile

Avoid use of x18 in the FP32 kernel: - C address lines x[18-26] renamed to x[19-27] (reg index +1) - Original role of x27 is fulfilled by x5, which is free after the k-loop part. FP64 does not require changes since x18 is not used there.
2026-05-11 17:50:00 +00:00 · 2021-05-29 18:44:47 +09:00
parent 9f4a4a3cfb
commit 5fc93e2806
1 changed files with 266 additions and 267 deletions
--- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
+++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c
@@ -93,20 +93,19 @@ __asm__ volatile
 "                                            \n\t"
 " add x16,x2,x10                             \n\t" //Load address Column 1 of C
 " add x17,x16,x10                            \n\t" //Load address Column 2 of C
-" add x18,x17,x10                            \n\t" //Load address Column 3 of C
-" add x19,x18,x10                            \n\t" //Load address Column 4 of C
-" add x20,x19,x10                            \n\t" //Load address Column 5 of C
-" add x21,x20,x10                            \n\t" //Load address Column 6 of C
-" add x22,x21,x10                            \n\t" //Load address Column 7 of C
-" add x23,x22,x10                            \n\t" //Load address Column 8 of C
-" add x24,x23,x10                            \n\t" //Load address Column 9 of C
-" add x25,x24,x10                            \n\t" //Load address Column 10 of C
-" add x26,x25,x10                            \n\t" //Load address Column 11 of C
+" add x19,x17,x10                            \n\t" //Load address Column 3 of C
+" add x20,x19,x10                            \n\t" //Load address Column 4 of C
+" add x21,x20,x10                            \n\t" //Load address Column 5 of C
+" add x22,x21,x10                            \n\t" //Load address Column 6 of C
+" add x23,x22,x10                            \n\t" //Load address Column 7 of C
+" add x24,x23,x10                            \n\t" //Load address Column 8 of C
+" add x25,x24,x10                            \n\t" //Load address Column 9 of C
+" add x26,x25,x10                            \n\t" //Load address Column 10 of C
+" add x27,x26,x10                            \n\t" //Load address Column 11 of C
 "                                            \n\t"
 " prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
 " prfm pldl1keep,[x16]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x17]                       \n\t" // Prefetch c.
-" prfm pldl1keep,[x18]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x19]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
@@ -115,6 +114,7 @@ __asm__ volatile
 " prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
 " prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
+" prfm pldl1keep,[x27]                       \n\t" // Prefetch c.
 "                                            \n\t"
 " dup  v8.4s, wzr                            \n\t" // Vector for accummulating column 0
 " prfm    PLDL1KEEP, [x1, #192]              \n\t" 
@@ -560,12 +560,12 @@ LABEL(SBETAZEROCOLSTOREDS1)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROCOLSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldr q8, [x18]                              \n\t" //Load column 3 of C
-" ldr q9, [x18, #16]                         \n\t"
-" ldr q10, [x19]                             \n\t" //Load column 4 of C
-" ldr q11, [x19, #16]                        \n\t"
-" ldr q12, [x20]                             \n\t" //Load column 5 of C
-" ldr q13, [x20, #16]                        \n\t"
+" ldr q8, [x19]                              \n\t" //Load column 3 of C
+" ldr q9, [x19, #16]                         \n\t"
+" ldr q10, [x20]                             \n\t" //Load column 4 of C
+" ldr q11, [x20, #16]                        \n\t"
+" ldr q12, [x21]                             \n\t" //Load column 5 of C
+" ldr q13, [x21, #16]                        \n\t"
 "                                            \n\t"
 " fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
 " fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -583,12 +583,12 @@ LABEL(SBETAZEROCOLSTOREDS2)
 " fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
 "                                            \n\t"
-" str q8, [x18]                              \n\t" //Store column 3 of C
-" str q9, [x18, #16]                         \n\t"
-" str q10, [x19]                             \n\t" //Store column 4 of C
-" str q11, [x19, #16]                        \n\t"
-" str q12, [x20]                             \n\t" //Store column 5 of C
-" str q13, [x20, #16]                        \n\t"
+" str q8, [x19]                              \n\t" //Store column 3 of C
+" str q9, [x19, #16]                         \n\t"
+" str q10, [x20]                             \n\t" //Store column 4 of C
+" str q11, [x20, #16]                        \n\t"
+" str q12, [x21]                             \n\t" //Store column 5 of C
+" str q13, [x21, #16]                        \n\t"
 "                                            \n\t"
 " dup  v0.4s, wzr                            \n\t"
 " dup  v1.4s, wzr                            \n\t"
@@ -600,12 +600,12 @@ LABEL(SBETAZEROCOLSTOREDS2)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROCOLSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldr q0, [x21]                              \n\t" //Load column 6 of C
-" ldr q1, [x21, #16]                         \n\t"
-" ldr q2, [x22]                              \n\t" //Load column 7 of C
-" ldr q3, [x22, #16]                         \n\t"
-" ldr q4, [x23]                              \n\t" //Load column 8 of C
-" ldr q5, [x23, #16]                         \n\t"
+" ldr q0, [x22]                              \n\t" //Load column 6 of C
+" ldr q1, [x22, #16]                         \n\t"
+" ldr q2, [x23]                              \n\t" //Load column 7 of C
+" ldr q3, [x23, #16]                         \n\t"
+" ldr q4, [x24]                              \n\t" //Load column 8 of C
+" ldr q5, [x24, #16]                         \n\t"
 "                                            \n\t"
 " fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -623,12 +623,12 @@ LABEL(SBETAZEROCOLSTOREDS3)
 " fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" str q0, [x21]                              \n\t" //Store column 6 of C
-" str q1, [x21, #16]                         \n\t"
-" str q2, [x22]                              \n\t" //Store column 7 of C
-" str q3, [x22, #16]                         \n\t"
-" str q4, [x23]                              \n\t" //Store column 8 of C
-" str q5, [x23, #16]                         \n\t"
+" str q0, [x22]                              \n\t" //Store column 6 of C
+" str q1, [x22, #16]                         \n\t"
+" str q2, [x23]                              \n\t" //Store column 7 of C
+" str q3, [x23, #16]                         \n\t"
+" str q4, [x24]                              \n\t" //Store column 8 of C
+" str q5, [x24, #16]                         \n\t"
 "                                            \n\t"
 " dup  v8.4s, wzr                            \n\t"
 " dup  v9.4s, wzr                            \n\t"
@@ -640,12 +640,12 @@ LABEL(SBETAZEROCOLSTOREDS3)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROCOLSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" ldr q8, [x24]                              \n\t" //Load column 9 of C
-" ldr q9, [x24, #16]                         \n\t"
-" ldr q10, [x25]                             \n\t" //Load column 10 of C
-" ldr q11, [x25, #16]                        \n\t"
-" ldr q12, [x26]                             \n\t" //Load column 11 of C
-" ldr q13, [x26, #16]                        \n\t"
+" ldr q8, [x25]                              \n\t" //Load column 9 of C
+" ldr q9, [x25, #16]                         \n\t"
+" ldr q10, [x26]                             \n\t" //Load column 10 of C
+" ldr q11, [x26, #16]                        \n\t"
+" ldr q12, [x27]                             \n\t" //Load column 11 of C
+" ldr q13, [x27, #16]                        \n\t"
 "                                            \n\t"
 " fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
 " fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -666,12 +666,12 @@ LABEL(SBETAZEROCOLSTOREDS4)
 " fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
 "                                            \n\t"
-" str q8, [x24]                              \n\t" //Store column 9 of C
-" str q9, [x24, #16]                         \n\t"
-" str q10, [x25]                             \n\t" //Store column 10 of C
-" str q11, [x25, #16]                        \n\t"
-" str q12, [x26]                             \n\t" //Store column 11 of C
-" str q13, [x26, #16]                        \n\t"
+" str q8, [x25]                              \n\t" //Store column 9 of C
+" str q9, [x25, #16]                         \n\t"
+" str q10, [x26]                             \n\t" //Store column 10 of C
+" str q11, [x26, #16]                        \n\t"
+" str q12, [x27]                             \n\t" //Store column 11 of C
+" str q13, [x27, #16]                        \n\t"
 "                                            \n\t"
 "                                            \n\t"
 BRANCH(SEND)                                       // Done.
@@ -690,38 +690,38 @@ LABEL(SGENSTORED)                                  // C is general-stride stored
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROGENSTOREDS1)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" mov x27, x2                                \n\t"
+" mov x5, x2                                 \n\t"
 "                                            \n\t"
-" ld1 {v0.s}[0],[x27],x14                    \n\t" // Load c00  into quad and increment by rs_c.
-" ld1 {v0.s}[1],[x27],x14                    \n\t" // Load c01  into quad and increment by rs_c.
-" ld1 {v0.s}[2],[x27],x14                    \n\t" // Load c02  into quad and increment by rs_c.
-" ld1 {v0.s}[3],[x27],x14                    \n\t" // Load c03  into quad and increment by rs_c.
-" ld1 {v1.s}[0],[x27],x14                    \n\t" // Load c04  into quad and increment by rs_c.
-" ld1 {v1.s}[1],[x27],x14                    \n\t" // Load c05  into quad and increment by rs_c.
-" ld1 {v1.s}[2],[x27],x14                    \n\t" // Load c06  into quad and increment by rs_c.
-" ld1 {v1.s}[3],[x27],x14                    \n\t" // Load c07  into quad and increment by rs_c.
+" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c00  into quad and increment by rs_c.
+" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c01  into quad and increment by rs_c.
+" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c02  into quad and increment by rs_c.
+" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c03  into quad and increment by rs_c.
+" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c04  into quad and increment by rs_c.
+" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c05  into quad and increment by rs_c.
+" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c06  into quad and increment by rs_c.
+" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c07  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x16                               \n\t"
+" mov x5, x16                                \n\t"
 "                                            \n\t"
-" ld1 {v2.s}[0],[x27],x14                    \n\t" // Load c10  into quad and increment by rs_c.
-" ld1 {v2.s}[1],[x27],x14                    \n\t" // Load c11  into quad and increment by rs_c.
-" ld1 {v2.s}[2],[x27],x14                    \n\t" // Load c12  into quad and increment by rs_c.
-" ld1 {v2.s}[3],[x27],x14                    \n\t" // Load c13  into quad and increment by rs_c.
-" ld1 {v3.s}[0],[x27],x14                    \n\t" // Load c14  into quad and increment by rs_c.
-" ld1 {v3.s}[1],[x27],x14                    \n\t" // Load c15  into quad and increment by rs_c.
-" ld1 {v3.s}[2],[x27],x14                    \n\t" // Load c16  into quad and increment by rs_c.
-" ld1 {v3.s}[3],[x27],x14                    \n\t" // Load c17  into quad and increment by rs_c.
+" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c10  into quad and increment by rs_c.
+" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c11  into quad and increment by rs_c.
+" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c12  into quad and increment by rs_c.
+" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c13  into quad and increment by rs_c.
+" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c14  into quad and increment by rs_c.
+" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c15  into quad and increment by rs_c.
+" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c16  into quad and increment by rs_c.
+" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c17  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x17                               \n\t"
+" mov x5, x17                                \n\t"
 "                                            \n\t"
-" ld1 {v4.s}[0],[x27],x14                    \n\t" // Load c20  into quad and increment by rs_c.
-" ld1 {v4.s}[1],[x27],x14                    \n\t" // Load c21  into quad and increment by rs_c.
-" ld1 {v4.s}[2],[x27],x14                    \n\t" // Load c22  into quad and increment by rs_c.
-" ld1 {v4.s}[3],[x27],x14                    \n\t" // Load c23  into quad and increment by rs_c.
-" ld1 {v5.s}[0],[x27],x14                    \n\t" // Load c24  into quad and increment by rs_c.
-" ld1 {v5.s}[1],[x27],x14                    \n\t" // Load c25  into quad and increment by rs_c.
-" ld1 {v5.s}[2],[x27],x14                    \n\t" // Load c26  into quad and increment by rs_c.
-" ld1 {v5.s}[3],[x27],x14                    \n\t" // Load c27  into quad and increment by rs_c.
+" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c20  into quad and increment by rs_c.
+" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c21  into quad and increment by rs_c.
+" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c22  into quad and increment by rs_c.
+" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c23  into quad and increment by rs_c.
+" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c24  into quad and increment by rs_c.
+" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c25  into quad and increment by rs_c.
+" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c26  into quad and increment by rs_c.
+" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c27  into quad and increment by rs_c.
 "                                            \n\t"
 " fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -739,38 +739,38 @@ LABEL(SBETAZEROGENSTOREDS1)
 " fmla v4.4s,v12.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v5.4s,v13.4s,v6.s[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" mov x27, x2                                \n\t"
+" mov x5, x2                                 \n\t"
 "                                            \n\t"
-" st1 {v0.s}[0],[x27],x14                    \n\t" // Store c00  into quad and increment by rs_c.
-" st1 {v0.s}[1],[x27],x14                    \n\t" // Store c01  into quad and increment by rs_c.
-" st1 {v0.s}[2],[x27],x14                    \n\t" // Store c02  into quad and increment by rs_c.
-" st1 {v0.s}[3],[x27],x14                    \n\t" // Store c03  into quad and increment by rs_c.
-" st1 {v1.s}[0],[x27],x14                    \n\t" // Store c04  into quad and increment by rs_c.
-" st1 {v1.s}[1],[x27],x14                    \n\t" // Store c05  into quad and increment by rs_c.
-" st1 {v1.s}[2],[x27],x14                    \n\t" // Store c06  into quad and increment by rs_c.
-" st1 {v1.s}[3],[x27],x14                    \n\t" // Store c07  into quad and increment by rs_c.
+" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c00  into quad and increment by rs_c.
+" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c01  into quad and increment by rs_c.
+" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c02  into quad and increment by rs_c.
+" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c03  into quad and increment by rs_c.
+" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c04  into quad and increment by rs_c.
+" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c05  into quad and increment by rs_c.
+" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c06  into quad and increment by rs_c.
+" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c07  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x16                               \n\t"
+" mov x5, x16                                \n\t"
 "                                            \n\t"
-" st1 {v2.s}[0],[x27],x14                    \n\t" // Store c10  into quad and increment by rs_c.
-" st1 {v2.s}[1],[x27],x14                    \n\t" // Store c11  into quad and increment by rs_c.
-" st1 {v2.s}[2],[x27],x14                    \n\t" // Store c12  into quad and increment by rs_c.
-" st1 {v2.s}[3],[x27],x14                    \n\t" // Store c13  into quad and increment by rs_c.
-" st1 {v3.s}[0],[x27],x14                    \n\t" // Store c14  into quad and increment by rs_c.
-" st1 {v3.s}[1],[x27],x14                    \n\t" // Store c15  into quad and increment by rs_c.
-" st1 {v3.s}[2],[x27],x14                    \n\t" // Store c16  into quad and increment by rs_c.
-" st1 {v3.s}[3],[x27],x14                    \n\t" // Store c17  into quad and increment by rs_c.
+" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c10  into quad and increment by rs_c.
+" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c11  into quad and increment by rs_c.
+" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c12  into quad and increment by rs_c.
+" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c13  into quad and increment by rs_c.
+" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c14  into quad and increment by rs_c.
+" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c15  into quad and increment by rs_c.
+" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c16  into quad and increment by rs_c.
+" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c17  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x17                               \n\t"
+" mov x5, x17                                \n\t"
 "                                            \n\t"
-" st1 {v4.s}[0],[x27],x14                    \n\t" // Store c20  into quad and increment by rs_c.
-" st1 {v4.s}[1],[x27],x14                    \n\t" // Store c21  into quad and increment by rs_c.
-" st1 {v4.s}[2],[x27],x14                    \n\t" // Store c22  into quad and increment by rs_c.
-" st1 {v4.s}[3],[x27],x14                    \n\t" // Store c23  into quad and increment by rs_c.
-" st1 {v5.s}[0],[x27],x14                    \n\t" // Store c24  into quad and increment by rs_c.
-" st1 {v5.s}[1],[x27],x14                    \n\t" // Store c25  into quad and increment by rs_c.
-" st1 {v5.s}[2],[x27],x14                    \n\t" // Store c26  into quad and increment by rs_c.
-" st1 {v5.s}[3],[x27],x14                    \n\t" // Store c27  into quad and increment by rs_c.
+" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c20  into quad and increment by rs_c.
+" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c21  into quad and increment by rs_c.
+" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c22  into quad and increment by rs_c.
+" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c23  into quad and increment by rs_c.
+" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c24  into quad and increment by rs_c.
+" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c25  into quad and increment by rs_c.
+" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c26  into quad and increment by rs_c.
+" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c27  into quad and increment by rs_c.
 "                                            \n\t"
 " dup  v8.4s, wzr                            \n\t"
 " dup  v9.4s, wzr                            \n\t"
@@ -782,38 +782,38 @@ LABEL(SBETAZEROGENSTOREDS1)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROGENSTOREDS2)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" mov x27, x18                               \n\t"
+" mov x5, x19                                \n\t"
 "                                            \n\t"
-" ld1 {v8.s}[0],[x27],x14                    \n\t" // Load c30  into quad and increment by rs_c.
-" ld1 {v8.s}[1],[x27],x14                    \n\t" // Load c31  into quad and increment by rs_c.
-" ld1 {v8.s}[2],[x27],x14                    \n\t" // Load c32  into quad and increment by rs_c.
-" ld1 {v8.s}[3],[x27],x14                    \n\t" // Load c33  into quad and increment by rs_c.
-" ld1 {v9.s}[0],[x27],x14                    \n\t" // Load c34  into quad and increment by rs_c.
-" ld1 {v9.s}[1],[x27],x14                    \n\t" // Load c35  into quad and increment by rs_c.
-" ld1 {v9.s}[2],[x27],x14                    \n\t" // Load c36  into quad and increment by rs_c.
-" ld1 {v9.s}[3],[x27],x14                    \n\t" // Load c37  into quad and increment by rs_c.
+" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c30  into quad and increment by rs_c.
+" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c31  into quad and increment by rs_c.
+" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c32  into quad and increment by rs_c.
+" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c33  into quad and increment by rs_c.
+" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c34  into quad and increment by rs_c.
+" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c35  into quad and increment by rs_c.
+" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c36  into quad and increment by rs_c.
+" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c37  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x19                               \n\t"
+" mov x5, x20                                \n\t"
 "                                            \n\t"
-" ld1 {v10.s}[0],[x27],x14                   \n\t" // Load c40  into quad and increment by rs_c.
-" ld1 {v10.s}[1],[x27],x14                   \n\t" // Load c41  into quad and increment by rs_c.
-" ld1 {v10.s}[2],[x27],x14                   \n\t" // Load c42  into quad and increment by rs_c.
-" ld1 {v10.s}[3],[x27],x14                   \n\t" // Load c43  into quad and increment by rs_c.
-" ld1 {v11.s}[0],[x27],x14                   \n\t" // Load c44  into quad and increment by rs_c.
-" ld1 {v11.s}[1],[x27],x14                   \n\t" // Load c45  into quad and increment by rs_c.
-" ld1 {v11.s}[2],[x27],x14                   \n\t" // Load c46  into quad and increment by rs_c.
-" ld1 {v11.s}[3],[x27],x14                   \n\t" // Load c47  into quad and increment by rs_c.
+" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c40  into quad and increment by rs_c.
+" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c41  into quad and increment by rs_c.
+" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c42  into quad and increment by rs_c.
+" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c43  into quad and increment by rs_c.
+" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c44  into quad and increment by rs_c.
+" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c45  into quad and increment by rs_c.
+" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c46  into quad and increment by rs_c.
+" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c47  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x20                               \n\t"
+" mov x5, x21                                \n\t"
 "                                            \n\t"
-" ld1 {v12.s}[0],[x27],x14                   \n\t" // Load c50  into quad and increment by rs_c.
-" ld1 {v12.s}[1],[x27],x14                   \n\t" // Load c51  into quad and increment by rs_c.
-" ld1 {v12.s}[2],[x27],x14                   \n\t" // Load c52  into quad and increment by rs_c.
-" ld1 {v12.s}[3],[x27],x14                   \n\t" // Load c53  into quad and increment by rs_c.
-" ld1 {v13.s}[0],[x27],x14                   \n\t" // Load c54  into quad and increment by rs_c.
-" ld1 {v13.s}[1],[x27],x14                   \n\t" // Load c55  into quad and increment by rs_c.
-" ld1 {v13.s}[2],[x27],x14                   \n\t" // Load c56  into quad and increment by rs_c.
-" ld1 {v13.s}[3],[x27],x14                   \n\t" // Load c57  into quad and increment by rs_c.
+" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c50  into quad and increment by rs_c.
+" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c51  into quad and increment by rs_c.
+" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c52  into quad and increment by rs_c.
+" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c53  into quad and increment by rs_c.
+" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c54  into quad and increment by rs_c.
+" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c55  into quad and increment by rs_c.
+" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c56  into quad and increment by rs_c.
+" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c57  into quad and increment by rs_c.
 "                                            \n\t"
 " fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
 " fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -831,38 +831,38 @@ LABEL(SBETAZEROGENSTOREDS2)
 " fmla v12.4s,v18.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v13.4s,v19.4s,v6.s[0]                 \n\t" // Scale by alpha
 "                                            \n\t"
-" mov x27, x18                               \n\t"
+" mov x5, x19                                \n\t"
 "                                            \n\t"
-" st1 {v8.s}[0],[x27],x14                    \n\t" // Store c30  into quad and increment by rs_c.
-" st1 {v8.s}[1],[x27],x14                    \n\t" // Store c31  into quad and increment by rs_c.
-" st1 {v8.s}[2],[x27],x14                    \n\t" // Store c32  into quad and increment by rs_c.
-" st1 {v8.s}[3],[x27],x14                    \n\t" // Store c33  into quad and increment by rs_c.
-" st1 {v9.s}[0],[x27],x14                    \n\t" // Store c34  into quad and increment by rs_c.
-" st1 {v9.s}[1],[x27],x14                    \n\t" // Store c35  into quad and increment by rs_c.
-" st1 {v9.s}[2],[x27],x14                    \n\t" // Store c36  into quad and increment by rs_c.
-" st1 {v9.s}[3],[x27],x14                    \n\t" // Store c37  into quad and increment by rs_c.
+" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c30  into quad and increment by rs_c.
+" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c31  into quad and increment by rs_c.
+" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c32  into quad and increment by rs_c.
+" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c33  into quad and increment by rs_c.
+" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c34  into quad and increment by rs_c.
+" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c35  into quad and increment by rs_c.
+" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c36  into quad and increment by rs_c.
+" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c37  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x19                               \n\t"
+" mov x5, x20                                \n\t"
 "                                            \n\t"
-" st1 {v10.s}[0],[x27],x14                   \n\t" // Store c40  into quad and increment by rs_c.
-" st1 {v10.s}[1],[x27],x14                   \n\t" // Store c41  into quad and increment by rs_c.
-" st1 {v10.s}[2],[x27],x14                   \n\t" // Store c42  into quad and increment by rs_c.
-" st1 {v10.s}[3],[x27],x14                   \n\t" // Store c43  into quad and increment by rs_c.
-" st1 {v11.s}[0],[x27],x14                   \n\t" // Store c44  into quad and increment by rs_c.
-" st1 {v11.s}[1],[x27],x14                   \n\t" // Store c45  into quad and increment by rs_c.
-" st1 {v11.s}[2],[x27],x14                   \n\t" // Store c46  into quad and increment by rs_c.
-" st1 {v11.s}[3],[x27],x14                   \n\t" // Store c47  into quad and increment by rs_c.
+" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c40  into quad and increment by rs_c.
+" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c41  into quad and increment by rs_c.
+" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c42  into quad and increment by rs_c.
+" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c43  into quad and increment by rs_c.
+" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c44  into quad and increment by rs_c.
+" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c45  into quad and increment by rs_c.
+" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c46  into quad and increment by rs_c.
+" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c47  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x20                               \n\t"
+" mov x5, x21                                \n\t"
 "                                            \n\t"
-" st1 {v12.s}[0],[x27],x14                   \n\t" // Store c50  into quad and increment by rs_c.
-" st1 {v12.s}[1],[x27],x14                   \n\t" // Store c51  into quad and increment by rs_c.
-" st1 {v12.s}[2],[x27],x14                   \n\t" // Store c52  into quad and increment by rs_c.
-" st1 {v12.s}[3],[x27],x14                   \n\t" // Store c53  into quad and increment by rs_c.
-" st1 {v13.s}[0],[x27],x14                   \n\t" // Store c54  into quad and increment by rs_c.
-" st1 {v13.s}[1],[x27],x14                   \n\t" // Store c55  into quad and increment by rs_c.
-" st1 {v13.s}[2],[x27],x14                   \n\t" // Store c56  into quad and increment by rs_c.
-" st1 {v13.s}[3],[x27],x14                   \n\t" // Store c57  into quad and increment by rs_c.
+" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c50  into quad and increment by rs_c.
+" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c51  into quad and increment by rs_c.
+" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c52  into quad and increment by rs_c.
+" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c53  into quad and increment by rs_c.
+" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c54  into quad and increment by rs_c.
+" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c55  into quad and increment by rs_c.
+" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c56  into quad and increment by rs_c.
+" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c57  into quad and increment by rs_c.
 "                                            \n\t"
 " dup  v0.4s, wzr                            \n\t"
 " dup  v1.4s, wzr                            \n\t"
@@ -874,38 +874,38 @@ LABEL(SBETAZEROGENSTOREDS2)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROGENSTOREDS3)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" mov x27, x21                               \n\t"
+" mov x5, x22                                \n\t"
 "                                            \n\t"
-" ld1 {v0.s}[0],[x27],x14                    \n\t" // Load c60  into quad and increment by rs_c.
-" ld1 {v0.s}[1],[x27],x14                    \n\t" // Load c61  into quad and increment by rs_c.
-" ld1 {v0.s}[2],[x27],x14                    \n\t" // Load c62  into quad and increment by rs_c.
-" ld1 {v0.s}[3],[x27],x14                    \n\t" // Load c63  into quad and increment by rs_c.
-" ld1 {v1.s}[0],[x27],x14                    \n\t" // Load c64  into quad and increment by rs_c.
-" ld1 {v1.s}[1],[x27],x14                    \n\t" // Load c65  into quad and increment by rs_c.
-" ld1 {v1.s}[2],[x27],x14                    \n\t" // Load c66  into quad and increment by rs_c.
-" ld1 {v1.s}[3],[x27],x14                    \n\t" // Load c67  into quad and increment by rs_c.
+" ld1 {v0.s}[0],[x5],x14                     \n\t" // Load c60  into quad and increment by rs_c.
+" ld1 {v0.s}[1],[x5],x14                     \n\t" // Load c61  into quad and increment by rs_c.
+" ld1 {v0.s}[2],[x5],x14                     \n\t" // Load c62  into quad and increment by rs_c.
+" ld1 {v0.s}[3],[x5],x14                     \n\t" // Load c63  into quad and increment by rs_c.
+" ld1 {v1.s}[0],[x5],x14                     \n\t" // Load c64  into quad and increment by rs_c.
+" ld1 {v1.s}[1],[x5],x14                     \n\t" // Load c65  into quad and increment by rs_c.
+" ld1 {v1.s}[2],[x5],x14                     \n\t" // Load c66  into quad and increment by rs_c.
+" ld1 {v1.s}[3],[x5],x14                     \n\t" // Load c67  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x22                               \n\t"
+" mov x5, x23                                \n\t"
 "                                            \n\t"
-" ld1 {v2.s}[0],[x27],x14                    \n\t" // Load c70  into quad and increment by rs_c.
-" ld1 {v2.s}[1],[x27],x14                    \n\t" // Load c71  into quad and increment by rs_c.
-" ld1 {v2.s}[2],[x27],x14                    \n\t" // Load c72  into quad and increment by rs_c.
-" ld1 {v2.s}[3],[x27],x14                    \n\t" // Load c73  into quad and increment by rs_c.
-" ld1 {v3.s}[0],[x27],x14                    \n\t" // Load c74  into quad and increment by rs_c.
-" ld1 {v3.s}[1],[x27],x14                    \n\t" // Load c75  into quad and increment by rs_c.
-" ld1 {v3.s}[2],[x27],x14                    \n\t" // Load c76  into quad and increment by rs_c.
-" ld1 {v3.s}[3],[x27],x14                    \n\t" // Load c77  into quad and increment by rs_c.
+" ld1 {v2.s}[0],[x5],x14                     \n\t" // Load c70  into quad and increment by rs_c.
+" ld1 {v2.s}[1],[x5],x14                     \n\t" // Load c71  into quad and increment by rs_c.
+" ld1 {v2.s}[2],[x5],x14                     \n\t" // Load c72  into quad and increment by rs_c.
+" ld1 {v2.s}[3],[x5],x14                     \n\t" // Load c73  into quad and increment by rs_c.
+" ld1 {v3.s}[0],[x5],x14                     \n\t" // Load c74  into quad and increment by rs_c.
+" ld1 {v3.s}[1],[x5],x14                     \n\t" // Load c75  into quad and increment by rs_c.
+" ld1 {v3.s}[2],[x5],x14                     \n\t" // Load c76  into quad and increment by rs_c.
+" ld1 {v3.s}[3],[x5],x14                     \n\t" // Load c77  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x23                               \n\t"
+" mov x5, x24                                \n\t"
 "                                            \n\t"
-" ld1 {v4.s}[0],[x27],x14                    \n\t" // Load c80  into quad and increment by rs_c.
-" ld1 {v4.s}[1],[x27],x14                    \n\t" // Load c81  into quad and increment by rs_c.
-" ld1 {v4.s}[2],[x27],x14                    \n\t" // Load c82  into quad and increment by rs_c.
-" ld1 {v4.s}[3],[x27],x14                    \n\t" // Load c83  into quad and increment by rs_c.
-" ld1 {v5.s}[0],[x27],x14                    \n\t" // Load c84  into quad and increment by rs_c.
-" ld1 {v5.s}[1],[x27],x14                    \n\t" // Load c85  into quad and increment by rs_c.
-" ld1 {v5.s}[2],[x27],x14                    \n\t" // Load c86  into quad and increment by rs_c.
-" ld1 {v5.s}[3],[x27],x14                    \n\t" // Load c87  into quad and increment by rs_c.
+" ld1 {v4.s}[0],[x5],x14                     \n\t" // Load c80  into quad and increment by rs_c.
+" ld1 {v4.s}[1],[x5],x14                     \n\t" // Load c81  into quad and increment by rs_c.
+" ld1 {v4.s}[2],[x5],x14                     \n\t" // Load c82  into quad and increment by rs_c.
+" ld1 {v4.s}[3],[x5],x14                     \n\t" // Load c83  into quad and increment by rs_c.
+" ld1 {v5.s}[0],[x5],x14                     \n\t" // Load c84  into quad and increment by rs_c.
+" ld1 {v5.s}[1],[x5],x14                     \n\t" // Load c85  into quad and increment by rs_c.
+" ld1 {v5.s}[2],[x5],x14                     \n\t" // Load c86  into quad and increment by rs_c.
+" ld1 {v5.s}[3],[x5],x14                     \n\t" // Load c87  into quad and increment by rs_c.
 "                                            \n\t"
 " fmul v0.4s,v0.4s,v7.s[0]                   \n\t" // Scale by beta
 " fmul v1.4s,v1.4s,v7.s[0]                   \n\t" // Scale by beta
@@ -923,38 +923,38 @@ LABEL(SBETAZEROGENSTOREDS3)
 " fmla v4.4s,v24.4s,v6.s[0]                  \n\t" // Scale by alpha
 " fmla v5.4s,v25.4s,v6.s[0]                  \n\t" // Scale by alpha
 "                                            \n\t"
-" mov x27, x21                               \n\t"
+" mov x5, x22                                \n\t"
 "                                            \n\t"
-" st1 {v0.s}[0],[x27],x14                    \n\t" // Store c60  into quad and increment by rs_c.
-" st1 {v0.s}[1],[x27],x14                    \n\t" // Store c61  into quad and increment by rs_c.
-" st1 {v0.s}[2],[x27],x14                    \n\t" // Store c62  into quad and increment by rs_c.
-" st1 {v0.s}[3],[x27],x14                    \n\t" // Store c63  into quad and increment by rs_c.
-" st1 {v1.s}[0],[x27],x14                    \n\t" // Store c64  into quad and increment by rs_c.
-" st1 {v1.s}[1],[x27],x14                    \n\t" // Store c65  into quad and increment by rs_c.
-" st1 {v1.s}[2],[x27],x14                    \n\t" // Store c66  into quad and increment by rs_c.
-" st1 {v1.s}[3],[x27],x14                    \n\t" // Store c67  into quad and increment by rs_c.
+" st1 {v0.s}[0],[x5],x14                     \n\t" // Store c60  into quad and increment by rs_c.
+" st1 {v0.s}[1],[x5],x14                     \n\t" // Store c61  into quad and increment by rs_c.
+" st1 {v0.s}[2],[x5],x14                     \n\t" // Store c62  into quad and increment by rs_c.
+" st1 {v0.s}[3],[x5],x14                     \n\t" // Store c63  into quad and increment by rs_c.
+" st1 {v1.s}[0],[x5],x14                     \n\t" // Store c64  into quad and increment by rs_c.
+" st1 {v1.s}[1],[x5],x14                     \n\t" // Store c65  into quad and increment by rs_c.
+" st1 {v1.s}[2],[x5],x14                     \n\t" // Store c66  into quad and increment by rs_c.
+" st1 {v1.s}[3],[x5],x14                     \n\t" // Store c67  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x22                               \n\t"
+" mov x5, x23                                \n\t"
 "                                            \n\t"
-" st1 {v2.s}[0],[x27],x14                    \n\t" // Store c70  into quad and increment by rs_c.
-" st1 {v2.s}[1],[x27],x14                    \n\t" // Store c71  into quad and increment by rs_c.
-" st1 {v2.s}[2],[x27],x14                    \n\t" // Store c72  into quad and increment by rs_c.
-" st1 {v2.s}[3],[x27],x14                    \n\t" // Store c73  into quad and increment by rs_c.
-" st1 {v3.s}[0],[x27],x14                    \n\t" // Store c74  into quad and increment by rs_c.
-" st1 {v3.s}[1],[x27],x14                    \n\t" // Store c75  into quad and increment by rs_c.
-" st1 {v3.s}[2],[x27],x14                    \n\t" // Store c76  into quad and increment by rs_c.
-" st1 {v3.s}[3],[x27],x14                    \n\t" // Store c77  into quad and increment by rs_c.
+" st1 {v2.s}[0],[x5],x14                     \n\t" // Store c70  into quad and increment by rs_c.
+" st1 {v2.s}[1],[x5],x14                     \n\t" // Store c71  into quad and increment by rs_c.
+" st1 {v2.s}[2],[x5],x14                     \n\t" // Store c72  into quad and increment by rs_c.
+" st1 {v2.s}[3],[x5],x14                     \n\t" // Store c73  into quad and increment by rs_c.
+" st1 {v3.s}[0],[x5],x14                     \n\t" // Store c74  into quad and increment by rs_c.
+" st1 {v3.s}[1],[x5],x14                     \n\t" // Store c75  into quad and increment by rs_c.
+" st1 {v3.s}[2],[x5],x14                     \n\t" // Store c76  into quad and increment by rs_c.
+" st1 {v3.s}[3],[x5],x14                     \n\t" // Store c77  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x23                               \n\t"
+" mov x5, x24                                \n\t"
 "                                            \n\t"
-" st1 {v4.s}[0],[x27],x14                    \n\t" // Store c80  into quad and increment by rs_c.
-" st1 {v4.s}[1],[x27],x14                    \n\t" // Store c81  into quad and increment by rs_c.
-" st1 {v4.s}[2],[x27],x14                    \n\t" // Store c82  into quad and increment by rs_c.
-" st1 {v4.s}[3],[x27],x14                    \n\t" // Store c83  into quad and increment by rs_c.
-" st1 {v5.s}[0],[x27],x14                    \n\t" // Store c84  into quad and increment by rs_c.
-" st1 {v5.s}[1],[x27],x14                    \n\t" // Store c85  into quad and increment by rs_c.
-" st1 {v5.s}[2],[x27],x14                    \n\t" // Store c86  into quad and increment by rs_c.
-" st1 {v5.s}[3],[x27],x14                    \n\t" // Store c87  into quad and increment by rs_c.
+" st1 {v4.s}[0],[x5],x14                     \n\t" // Store c80  into quad and increment by rs_c.
+" st1 {v4.s}[1],[x5],x14                     \n\t" // Store c81  into quad and increment by rs_c.
+" st1 {v4.s}[2],[x5],x14                     \n\t" // Store c82  into quad and increment by rs_c.
+" st1 {v4.s}[3],[x5],x14                     \n\t" // Store c83  into quad and increment by rs_c.
+" st1 {v5.s}[0],[x5],x14                     \n\t" // Store c84  into quad and increment by rs_c.
+" st1 {v5.s}[1],[x5],x14                     \n\t" // Store c85  into quad and increment by rs_c.
+" st1 {v5.s}[2],[x5],x14                     \n\t" // Store c86  into quad and increment by rs_c.
+" st1 {v5.s}[3],[x5],x14                     \n\t" // Store c87  into quad and increment by rs_c.
 "                                            \n\t"
 " dup  v8.4s, wzr                            \n\t"
 " dup  v9.4s, wzr                            \n\t"
@@ -966,38 +966,38 @@ LABEL(SBETAZEROGENSTOREDS3)
 " fcmp s7,#0.0                               \n\t"
 BEQ(SBETAZEROGENSTOREDS4)                          // Taking care of the beta==0 case.
 "                                            \n\t"
-" mov x27, x24                               \n\t"
+" mov x5, x25                                \n\t"
 "                                            \n\t"
-" ld1 {v8.s}[0],[x27],x14                    \n\t" // Load c90  into quad and increment by rs_c.
-" ld1 {v8.s}[1],[x27],x14                    \n\t" // Load c91  into quad and increment by rs_c.
-" ld1 {v8.s}[2],[x27],x14                    \n\t" // Load c92  into quad and increment by rs_c.
-" ld1 {v8.s}[3],[x27],x14                    \n\t" // Load c93  into quad and increment by rs_c.
-" ld1 {v9.s}[0],[x27],x14                    \n\t" // Load c94  into quad and increment by rs_c.
-" ld1 {v9.s}[1],[x27],x14                    \n\t" // Load c95  into quad and increment by rs_c.
-" ld1 {v9.s}[2],[x27],x14                    \n\t" // Load c96  into quad and increment by rs_c.
-" ld1 {v9.s}[3],[x27],x14                    \n\t" // Load c97  into quad and increment by rs_c.
+" ld1 {v8.s}[0],[x5],x14                     \n\t" // Load c90  into quad and increment by rs_c.
+" ld1 {v8.s}[1],[x5],x14                     \n\t" // Load c91  into quad and increment by rs_c.
+" ld1 {v8.s}[2],[x5],x14                     \n\t" // Load c92  into quad and increment by rs_c.
+" ld1 {v8.s}[3],[x5],x14                     \n\t" // Load c93  into quad and increment by rs_c.
+" ld1 {v9.s}[0],[x5],x14                     \n\t" // Load c94  into quad and increment by rs_c.
+" ld1 {v9.s}[1],[x5],x14                     \n\t" // Load c95  into quad and increment by rs_c.
+" ld1 {v9.s}[2],[x5],x14                     \n\t" // Load c96  into quad and increment by rs_c.
+" ld1 {v9.s}[3],[x5],x14                     \n\t" // Load c97  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x25                               \n\t"
+" mov x5, x26                                \n\t"
 "                                            \n\t"
-" ld1 {v10.s}[0],[x27],x14                   \n\t" // Load c100  into quad and increment by rs_c.
-" ld1 {v10.s}[1],[x27],x14                   \n\t" // Load c101  into quad and increment by rs_c.
-" ld1 {v10.s}[2],[x27],x14                   \n\t" // Load c102  into quad and increment by rs_c.
-" ld1 {v10.s}[3],[x27],x14                   \n\t" // Load c103  into quad and increment by rs_c.
-" ld1 {v11.s}[0],[x27],x14                   \n\t" // Load c104  into quad and increment by rs_c.
-" ld1 {v11.s}[1],[x27],x14                   \n\t" // Load c105  into quad and increment by rs_c.
-" ld1 {v11.s}[2],[x27],x14                   \n\t" // Load c106  into quad and increment by rs_c.
-" ld1 {v11.s}[3],[x27],x14                   \n\t" // Load c107  into quad and increment by rs_c.
+" ld1 {v10.s}[0],[x5],x14                    \n\t" // Load c100  into quad and increment by rs_c.
+" ld1 {v10.s}[1],[x5],x14                    \n\t" // Load c101  into quad and increment by rs_c.
+" ld1 {v10.s}[2],[x5],x14                    \n\t" // Load c102  into quad and increment by rs_c.
+" ld1 {v10.s}[3],[x5],x14                    \n\t" // Load c103  into quad and increment by rs_c.
+" ld1 {v11.s}[0],[x5],x14                    \n\t" // Load c104  into quad and increment by rs_c.
+" ld1 {v11.s}[1],[x5],x14                    \n\t" // Load c105  into quad and increment by rs_c.
+" ld1 {v11.s}[2],[x5],x14                    \n\t" // Load c106  into quad and increment by rs_c.
+" ld1 {v11.s}[3],[x5],x14                    \n\t" // Load c107  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x26                               \n\t"
+" mov x5, x27                                \n\t"
 "                                            \n\t"
-" ld1 {v12.s}[0],[x27],x14                   \n\t" // Load c110  into quad and increment by rs_c.
-" ld1 {v12.s}[1],[x27],x14                   \n\t" // Load c111  into quad and increment by rs_c.
-" ld1 {v12.s}[2],[x27],x14                   \n\t" // Load c112  into quad and increment by rs_c.
-" ld1 {v12.s}[3],[x27],x14                   \n\t" // Load c113  into quad and increment by rs_c.
-" ld1 {v13.s}[0],[x27],x14                   \n\t" // Load c114  into quad and increment by rs_c.
-" ld1 {v13.s}[1],[x27],x14                   \n\t" // Load c115  into quad and increment by rs_c.
-" ld1 {v13.s}[2],[x27],x14                   \n\t" // Load c116  into quad and increment by rs_c.
-" ld1 {v13.s}[3],[x27],x14                   \n\t" // Load c117  into quad and increment by rs_c.
+" ld1 {v12.s}[0],[x5],x14                    \n\t" // Load c110  into quad and increment by rs_c.
+" ld1 {v12.s}[1],[x5],x14                    \n\t" // Load c111  into quad and increment by rs_c.
+" ld1 {v12.s}[2],[x5],x14                    \n\t" // Load c112  into quad and increment by rs_c.
+" ld1 {v12.s}[3],[x5],x14                    \n\t" // Load c113  into quad and increment by rs_c.
+" ld1 {v13.s}[0],[x5],x14                    \n\t" // Load c114  into quad and increment by rs_c.
+" ld1 {v13.s}[1],[x5],x14                    \n\t" // Load c115  into quad and increment by rs_c.
+" ld1 {v13.s}[2],[x5],x14                    \n\t" // Load c116  into quad and increment by rs_c.
+" ld1 {v13.s}[3],[x5],x14                    \n\t" // Load c117  into quad and increment by rs_c.
 "                                            \n\t"
 " fmul v8.4s, v8.4s, v7.s[0]                 \n\t" // Scale by beta
 " fmul v9.4s, v9.4s, v7.s[0]                 \n\t" // Scale by beta
@@ -1018,38 +1018,38 @@ LABEL(SBETAZEROGENSTOREDS4)
 " fmla v12.4s,v30.4s,v6.s[0]                 \n\t" // Scale by alpha
 " fmla v13.4s,v31.4s,v6.s[0]                 \n\t" // Scale by alpha
 "                                            \n\t"
-" mov x27, x24                               \n\t"
+" mov x5, x25                                \n\t"
 "                                            \n\t"
-" st1 {v8.s}[0],[x27],x14                    \n\t" // Store c90  into quad and increment by rs_c.
-" st1 {v8.s}[1],[x27],x14                    \n\t" // Store c91  into quad and increment by rs_c.
-" st1 {v8.s}[2],[x27],x14                    \n\t" // Store c92  into quad and increment by rs_c.
-" st1 {v8.s}[3],[x27],x14                    \n\t" // Store c93  into quad and increment by rs_c.
-" st1 {v9.s}[0],[x27],x14                    \n\t" // Store c94  into quad and increment by rs_c.
-" st1 {v9.s}[1],[x27],x14                    \n\t" // Store c95  into quad and increment by rs_c.
-" st1 {v9.s}[2],[x27],x14                    \n\t" // Store c96  into quad and increment by rs_c.
-" st1 {v9.s}[3],[x27],x14                    \n\t" // Store c97  into quad and increment by rs_c.
+" st1 {v8.s}[0],[x5],x14                     \n\t" // Store c90  into quad and increment by rs_c.
+" st1 {v8.s}[1],[x5],x14                     \n\t" // Store c91  into quad and increment by rs_c.
+" st1 {v8.s}[2],[x5],x14                     \n\t" // Store c92  into quad and increment by rs_c.
+" st1 {v8.s}[3],[x5],x14                     \n\t" // Store c93  into quad and increment by rs_c.
+" st1 {v9.s}[0],[x5],x14                     \n\t" // Store c94  into quad and increment by rs_c.
+" st1 {v9.s}[1],[x5],x14                     \n\t" // Store c95  into quad and increment by rs_c.
+" st1 {v9.s}[2],[x5],x14                     \n\t" // Store c96  into quad and increment by rs_c.
+" st1 {v9.s}[3],[x5],x14                     \n\t" // Store c97  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x25                               \n\t"
+" mov x5, x26                                \n\t"
 "                                            \n\t"
-" st1 {v10.s}[0],[x27],x14                   \n\t" // Store c100  into quad and increment by rs_c.
-" st1 {v10.s}[1],[x27],x14                   \n\t" // Store c101  into quad and increment by rs_c.
-" st1 {v10.s}[2],[x27],x14                   \n\t" // Store c102  into quad and increment by rs_c.
-" st1 {v10.s}[3],[x27],x14                   \n\t" // Store c103  into quad and increment by rs_c.
-" st1 {v11.s}[0],[x27],x14                   \n\t" // Store c104  into quad and increment by rs_c.
-" st1 {v11.s}[1],[x27],x14                   \n\t" // Store c105  into quad and increment by rs_c.
-" st1 {v11.s}[2],[x27],x14                   \n\t" // Store c106  into quad and increment by rs_c.
-" st1 {v11.s}[3],[x27],x14                   \n\t" // Store c107  into quad and increment by rs_c.
+" st1 {v10.s}[0],[x5],x14                    \n\t" // Store c100  into quad and increment by rs_c.
+" st1 {v10.s}[1],[x5],x14                    \n\t" // Store c101  into quad and increment by rs_c.
+" st1 {v10.s}[2],[x5],x14                    \n\t" // Store c102  into quad and increment by rs_c.
+" st1 {v10.s}[3],[x5],x14                    \n\t" // Store c103  into quad and increment by rs_c.
+" st1 {v11.s}[0],[x5],x14                    \n\t" // Store c104  into quad and increment by rs_c.
+" st1 {v11.s}[1],[x5],x14                    \n\t" // Store c105  into quad and increment by rs_c.
+" st1 {v11.s}[2],[x5],x14                    \n\t" // Store c106  into quad and increment by rs_c.
+" st1 {v11.s}[3],[x5],x14                    \n\t" // Store c107  into quad and increment by rs_c.
 "                                            \n\t"
-" mov x27, x26                               \n\t"
+" mov x5, x27                                \n\t"
 "                                            \n\t"
-" st1 {v12.s}[0],[x27],x14                   \n\t" // Store c110  into quad and increment by rs_c.
-" st1 {v12.s}[1],[x27],x14                   \n\t" // Store c111  into quad and increment by rs_c.
-" st1 {v12.s}[2],[x27],x14                   \n\t" // Store c112  into quad and increment by rs_c.
-" st1 {v12.s}[3],[x27],x14                   \n\t" // Store c113  into quad and increment by rs_c.
-" st1 {v13.s}[0],[x27],x14                   \n\t" // Store c114  into quad and increment by rs_c.
-" st1 {v13.s}[1],[x27],x14                   \n\t" // Store c115  into quad and increment by rs_c.
-" st1 {v13.s}[2],[x27],x14                   \n\t" // Store c116  into quad and increment by rs_c.
-" st1 {v13.s}[3],[x27],x14                   \n\t" // Store c147  into quad and increment by rs_c.
+" st1 {v12.s}[0],[x5],x14                    \n\t" // Store c110  into quad and increment by rs_c.
+" st1 {v12.s}[1],[x5],x14                    \n\t" // Store c111  into quad and increment by rs_c.
+" st1 {v12.s}[2],[x5],x14                    \n\t" // Store c112  into quad and increment by rs_c.
+" st1 {v12.s}[3],[x5],x14                    \n\t" // Store c113  into quad and increment by rs_c.
+" st1 {v13.s}[0],[x5],x14                    \n\t" // Store c114  into quad and increment by rs_c.
+" st1 {v13.s}[1],[x5],x14                    \n\t" // Store c115  into quad and increment by rs_c.
+" st1 {v13.s}[2],[x5],x14                    \n\t" // Store c116  into quad and increment by rs_c.
+" st1 {v13.s}[3],[x5],x14                    \n\t" // Store c117  into quad and increment by rs_c.
 "                                            \n\t"
 LABEL(SEND)                                        // Done!
 "                                            \n\t"
@@ -1068,11 +1068,10 @@ LABEL(SEND)                                        // Done!
 [b_next] "m" (b_next) // 10
 :// Register clobber list
 "x0", "x1", "x2",
- "x5", "x6", "x10",
- "x14",
- "x16","x17","x18","x19",       
- "x20","x21","x22","x23",
- "x24","x25","x26","x27",
+ "x5", "x6", "x10","x14",
+ "x16","x17","x19","x20",
+ "x21","x22","x23","x24",
+ "x25","x26","x27",
 "v0", "v1", "v2", "v3",
 "v4", "v5", "v6", "v7",
 "v8", "v9", "v10","v11",