From 7fabd896af773623ed01820a71bbff432e8a7d25 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 May 2021 16:28:03 +0900 Subject: [PATCH 1/4] Asm Flag Mingling for Darwin_Aarch64 Apple+Arm64 requires additional "tagging" of local symbols. --- kernels/armv8a/3/armv8a_asm_utils.h | 49 ++++++++ kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 125 ++++++++++---------- 2 files changed, 112 insertions(+), 62 deletions(-) create mode 100644 kernels/armv8a/3/armv8a_asm_utils.h diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h new file mode 100644 index 000000000..7bf97d555 --- /dev/null +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +// Apple's local label requirements. +#if defined(__APPLE__) +#define LABEL(str) " L" #str": \n\t" +#define BEQ(str) "b.eq L" #str" \n\t" +#define BNE(str) "b.ne L" #str" \n\t" +#define BRANCH(str) "b L" #str" \n\t" +#else +#define LABEL(str) " ." #str": \n\t" +#define BEQ(str) "b.eq ." #str" \n\t" +#define BNE(str) "b.ne ." #str" \n\t" +#define BRANCH(str) "b ." #str" \n\t" +#endif + diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index c01c67f5a..251931f7c 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -34,6 +34,7 @@ */ #include "blis.h" +#include "armv8a_asm_utils.h" /* o 4x4 Single precision micro-kernel fully functional. @@ -155,7 +156,7 @@ __asm__ volatile " dup v31.4s, wzr \n\t" // Vector for accummulating column 11 " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. -" beq .SCONSIDERKLEFT \n\t" +BEQ(SCONSIDERKLEFT) " \n\t" " ldr q0, [x0] \n\t" " ldr q1, [x0, #16] \n\t" // Load a @@ -168,9 +169,9 @@ __asm__ volatile " add x1, x1, #48 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. -" beq .SLASTITER \n\t" // (as loop is do-while-like). +BEQ(SLASTITER) // (as loop is do-while-like). " \n\t" -" .SLOOPKITER: \n\t" // Body of the k_iter loop. +LABEL(SLOOPKITER) // Body of the k_iter loop. 
" \n\t" " ldr q5, [x0] \n\t" " fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate. @@ -316,9 +317,9 @@ __asm__ volatile " \n\t" //End It 4 " sub x5,x5,1 \n\t" // i-=1. " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. -" bne .SLOOPKITER \n\t" +BNE(SLOOPKITER) " \n\t" -" .SLASTITER: \n\t" // Last iteration of k_iter loop. +LABEL(SLASTITER) // Last iteration of k_iter loop. " \n\t" " \n\t" " ldr q5, [x0] \n\t" @@ -454,11 +455,11 @@ __asm__ volatile " add x0, x0, #96 \n\t" " \n\t" //End It 4 " \n\t" -" .SCONSIDERKLEFT: \n\t" +LABEL(SCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. -" beq .SPOSTACCUM \n\t" // else, we enter the k_left loop. +BEQ(SPOSTACCUM) // else, we enter the k_left loop. " \n\t" -" .SLOOPKLEFT: \n\t" // Body of the left iterations +LABEL(SLOOPKLEFT) // Body of the left iterations " \n\t" " ldr q0, [x0],#16 \n\t" " ldr q1, [x0],#16 \n\t" // Load a @@ -497,17 +498,17 @@ __asm__ volatile " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " \n\t" " cmp x6,0 \n\t" // Iterate again. -" bne .SLOOPKLEFT \n\t" // if i!=0. +BNE(SLOOPKLEFT) // if i!=0. " \n\t" -" .SPOSTACCUM: \n\t" +LABEL(SPOSTACCUM) " \n\t" " ld1r {v6.4s},[x7] \n\t" // Load alpha. " ld1r {v7.4s},[x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) -" bne .SGENSTORED \n\t" +BNE(SGENSTORED) " \n\t" -" .SCOLSTORED: \n\t" // C is column-major. +LABEL(SCOLSTORED) // C is column-major. " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" @@ -517,7 +518,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. 
" \n\t" " ldr q0, [x2] \n\t" //Load column 0 of C " ldr q1, [x2, #16] \n\t" @@ -533,7 +534,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS1: \n\t" +LABEL(SBETAZEROCOLSTOREDS1) " \n\t" " fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha @@ -557,7 +558,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. " \n\t" " ldr q8, [x18] \n\t" //Load column 3 of C " ldr q9, [x18, #16] \n\t" @@ -573,7 +574,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS2: \n\t" +LABEL(SBETAZEROCOLSTOREDS2) " \n\t" " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha @@ -597,7 +598,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x21] \n\t" //Load column 6 of C " ldr q1, [x21, #16] \n\t" @@ -613,7 +614,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS3: \n\t" +LABEL(SBETAZEROCOLSTOREDS3) " \n\t" " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha @@ -637,7 +638,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
" \n\t" " ldr q8, [x24] \n\t" //Load column 9 of C " ldr q9, [x24, #16] \n\t" @@ -653,7 +654,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS4: \n\t" +LABEL(SBETAZEROCOLSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -673,10 +674,10 @@ __asm__ volatile " str q13, [x26, #16] \n\t" " \n\t" " \n\t" -" b .SEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). +BRANCH(SEND) // Done. " \n\t" " \n\t" -" .SGENSTORED: \n\t" // C is general-stride stored. +LABEL(SGENSTORED) // C is general-stride stored. " \n\t" " \n\t" " dup v0.4s, wzr \n\t" @@ -687,7 +688,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. " \n\t" " mov x27, x2 \n\t" " \n\t" @@ -729,7 +730,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS1: \n\t" +LABEL(SBETAZEROGENSTOREDS1) " \n\t" " fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha @@ -779,7 +780,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. 
" \n\t" " mov x27, x18 \n\t" " \n\t" @@ -821,7 +822,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS2: \n\t" +LABEL(SBETAZEROGENSTOREDS2) " \n\t" " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha @@ -871,7 +872,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. " \n\t" " mov x27, x21 \n\t" " \n\t" @@ -913,7 +914,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS3: \n\t" +LABEL(SBETAZEROGENSTOREDS3) " \n\t" " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha @@ -963,7 +964,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. " \n\t" " mov x27, x24 \n\t" " \n\t" @@ -1005,7 +1006,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS4: \n\t" +LABEL(SBETAZEROGENSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -1050,7 +1051,7 @@ __asm__ volatile " st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c. " st1 {v13.s}[3],[x27],x14 \n\t" // Store c147 into quad and increment by rs_c. " \n\t" -" .SEND: \n\t" // Done! +LABEL(SEND) // Done! " \n\t" :// output operands (none) :// input operands @@ -1203,7 +1204,7 @@ __asm__ volatile " \n\t" " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. 
-" beq .DCONSIDERKLEFT \n\t" +BEQ(DCONSIDERKLEFT) " \n\t" " ldr q0, [x0] \n\t" // Load a " ldr q1, [x0, #16] \n\t" @@ -1218,9 +1219,9 @@ __asm__ volatile " add x1, x1, #64 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. -" beq .DLASTITER \n\t" // (as loop is do-while-like). +BEQ(DLASTITER) // (as loop is do-while-like). " \n\t" -" DLOOP: \n\t" // Body +LABEL(DLOOP) // Body " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448 @@ -1394,9 +1395,9 @@ __asm__ volatile " \n\t" " sub x5,x5,1 \n\t" // i-=1 " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. -" bne DLOOP \n\t" +BNE(DLOOP) " \n\t" -".DLASTITER: \n\t" +LABEL(DLASTITER) " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate @@ -1554,11 +1555,11 @@ __asm__ volatile " \n\t" //End it 4 " add x0, x0, #144 \n\t" " \n\t" -" .DCONSIDERKLEFT: \n\t" +LABEL(DCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. -" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. +BEQ(DPOSTACCUM) // else, we enter the k_left loop. " \n\t" -".DLOOPKLEFT: \n\t" +LABEL(DLOOPKLEFT) " \n\t" " ldr q0, [x0],#16 \n\t" " ldr q1, [x0],#16 \n\t" // Load a @@ -1605,17 +1606,17 @@ __asm__ volatile " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate " \n\t" " cmp x6,0 \n\t" // Iterate again. -" bne .DLOOPKLEFT \n\t" // if i!=0. +BNE(DLOOPKLEFT) // if i!=0. " \n\t" -" .DPOSTACCUM: \n\t" +LABEL(DPOSTACCUM) " \n\t" " ld1r {v6.2d},[x7] \n\t" // Load alpha. " ld1r {v7.2d},[x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) -" bne .DGENSTORED \n\t" +BNE(DGENSTORED) " \n\t" -" .DCOLSTORED: \n\t" // C is column-major. +LABEL(DCOLSTORED) // C is column-major. 
" \n\t" " dup v0.2d, xzr \n\t" " dup v1.2d, xzr \n\t" @@ -1625,7 +1626,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x2] \n\t" //Load column 0 of C " ldr q1, [x2, #16] \n\t" @@ -1642,7 +1643,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS1: \n\t" +LABEL(DBETAZEROCOLSTOREDS1) " \n\t" " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha @@ -1667,7 +1668,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. " \n\t" " ldr q8, [x21] \n\t" //Load column 2 of C " ldr q9, [x21, #16] \n\t" @@ -1684,7 +1685,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS2: \n\t" +LABEL(DBETAZEROCOLSTOREDS2) " \n\t" " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha @@ -1709,7 +1710,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
" \n\t" " ldr q0, [x23] \n\t" //Load column 4 of C " ldr q1, [x23, #16] \n\t" @@ -1726,7 +1727,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS3: \n\t" +LABEL(DBETAZEROCOLSTOREDS3) " \n\t" " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha @@ -1751,7 +1752,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. " \n\t" " ldr q8, [x25] \n\t" //Load column 6 of C " ldr q9, [x25, #16] \n\t" @@ -1768,7 +1769,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS4: \n\t" +LABEL(DBETAZEROCOLSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -1788,9 +1789,9 @@ __asm__ volatile " str q12, [x26, #16] \n\t" " str q13, [x26, #32] \n\t" " \n\t" -" b .DEND \n\t" +BRANCH(DEND) " \n\t" -" .DGENSTORED: \n\t" // C is general-stride stored. +LABEL(DGENSTORED) // C is general-stride stored. " \n\t" " dup v0.2d, xzr \n\t" " dup v1.2d, xzr \n\t" @@ -1800,7 +1801,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. " \n\t" " mov x27, x2 \n\t" " \n\t" // Load address of C. 
@@ -1827,7 +1828,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS1: \n\t" +LABEL(DBETAZEROGENSTOREDS1) " \n\t" " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha @@ -1862,7 +1863,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. " \n\t" " mov x27, x21 \n\t" // Load address of C. " \n\t" @@ -1889,7 +1890,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS2: \n\t" +LABEL(DBETAZEROGENSTOREDS2) " \n\t" " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha @@ -1924,7 +1925,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. " \n\t" " mov x27, x23 \n\t" // Load address of C. " \n\t" @@ -1951,7 +1952,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS3: \n\t" +LABEL(DBETAZEROGENSTOREDS3) " \n\t" " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha @@ -1986,7 +1987,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. 
" \n\t" " mov x27, x25 \n\t" " \n\t" @@ -2013,7 +2014,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS4: \n\t" +LABEL(DBETAZEROGENSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -2043,7 +2044,7 @@ __asm__ volatile " st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. " st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. " \n\t" -" .DEND: \n\t" // Done! +LABEL(DEND) // Done! " \n\t" :// output operands (none) :// input operands From 916e1fa8be3cea0e3e2a4a7e8b00027ac2ee7780 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 May 2021 16:46:52 +0900 Subject: [PATCH 2/4] Armv8A Rename Regs for Clang Compile: FP64 Part - x7, x8: Used to store the addresses of Alpha and Beta. As Alpha & Beta are not used in the k-loops, use x0, x1 to load Alpha & Beta's addresses after the k-loops are completed, since A & B's addresses are no longer needed there. This "ldr [addr]; -> ldr val, [addr]" would not cause much performance drawback since it is done outside the k-loops and there are plenty of instructions between Alpha & Beta's loading and usage. - x9: Used to store cs_c. x9 is multiplied by 8 into x10 and not used any longer. Loading cs_c directly into x10 and scaling it by 8 in place frees x9. - x11, x12: Not used at all. Simply remove from clobber list. - x13: Like x9, loaded and scaled by 8 into x14, except that x13 is also used in a conditional branch so that "cmp x13, #1" needs to be modified into "cmp x14, #8" to completely free x13. - x3, x4: Used to store next_a & next_b. Untouched in the k-loops. Load these addresses into x0 and x1 after Alpha & Beta are both loaded, since by then neither the addresses of A/B nor those of Alpha/Beta are needed.
--- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 44 ++++++++++----------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 251931f7c..279b61b79 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -1135,20 +1135,14 @@ __asm__ volatile " ldr x1,%[baddr] \n\t" // Load address of B " ldr x2,%[caddr] \n\t" // Load address of C " \n\t" -" ldr x3,%[a_next] \n\t" // Move pointer -" ldr x4,%[b_next] \n\t" // Move pointer -" \n\t" " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) " ldr x6,%[k_left] \n\t" // Init guard (k_iter) " \n\t" -" ldr x7,%[alpha] \n\t" // Alpha address -" ldr x8,%[beta] \n\t" // Beta address -" \n\t" -" ldr x9,%[cs_c] \n\t" // Load cs_c -" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) +" ldr x10,%[cs_c] \n\t" // Load cs_c +" lsl x10,x10,#3 \n\t" // cs_c * sizeof(double) " \n\t" -" ldr x13,%[rs_c] \n\t" // Load rs_c. -" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). +" ldr x14,%[rs_c] \n\t" // Load rs_c. +" lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). " \n\t" " add x20,x2,x10 \n\t" //Load address Column 1 of C " add x21,x20,x10 \n\t" //Load address Column 2 of C @@ -1610,10 +1604,16 @@ BNE(DLOOPKLEFT) // if i!=0. " \n\t" LABEL(DPOSTACCUM) " \n\t" -" ld1r {v6.2d},[x7] \n\t" // Load alpha. -" ld1r {v7.2d},[x8] \n\t" // Load beta +" ldr x0,%[alpha] \n\t" // Alpha address +" ldr x1,%[beta] \n\t" // Beta address +" \n\t" +" ld1r {v6.2d},[x0] \n\t" // Load alpha. +" ld1r {v7.2d},[x1] \n\t" // Load beta " \n\t" -" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" ldr x0,%[a_next] \n\t" // Next A address for later use. +" ldr x1,%[b_next] \n\t" // Next B address for later use. +" \n\t" +" cmp x14,#8 \n\t" // If rs_c != 1 (column-major) BNE(DGENSTORED) " \n\t" LABEL(DCOLSTORED) // C is column-major. 
@@ -1771,8 +1771,8 @@ BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 " \n\t" LABEL(DBETAZEROCOLSTOREDS4) " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[x0] \n\t" +" prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha @@ -2016,8 +2016,8 @@ BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 " \n\t" LABEL(DBETAZEROGENSTOREDS4) " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[x0] \n\t" +" prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha @@ -2060,12 +2060,10 @@ LABEL(DEND) // Done! [a_next] "m" (a_next), // 8 [b_next] "m" (b_next) // 9 :// Register clobber list - "x0","x1","x2","x3", - "x4","x5","x6", - "x7","x8","x9", - "x10","x11","x12","x13","x14","x16","x17", - "x20","x21","x22","x23","x24","x25","x26", - "x27", + "x0","x1","x2", + "x5","x6","x10", + "x14","x16","x17", + "x20","x21","x22","x23","x24","x25","x26","x27", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", From 9f4a4a3cfb2244e4024445e127dafd2a11f39fc5 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 May 2021 17:21:28 +0900 Subject: [PATCH 3/4] Armv8A Rename Regs for Clang Compile: FP32 Part Roughly the same as 916e1fa, with the x15 clobber additionally removed. - x15: Not used at all. Compilation w/ Clang shows a warning about x18 reservation, but compilation itself is OK and all tests pass. --- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 41 ++++++++++----------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 279b61b79..be5e20ae7 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -82,20 +82,14 @@ __asm__ volatile " ldr x1,%[baddr] \n\t" // Load address of B.
" ldr x2,%[caddr] \n\t" // Load address of C. " \n\t" -" ldr x3,%[a_next] \n\t" // Pointer to next block of A. -" ldr x4,%[b_next] \n\t" // Pointer to next pointer of B. -" \n\t" " ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). " ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). " \n\t" -" ldr x7,%[alpha] \n\t" // Alpha address. -" ldr x8,%[beta] \n\t" // Beta address. +" ldr x10,%[cs_c] \n\t" // Load cs_c. +" lsl x10,x10,#2 \n\t" // cs_c * sizeof(float) -- AUX. " \n\t" -" ldr x9,%[cs_c] \n\t" // Load cs_c. -" lsl x10,x9,#2 \n\t" // cs_c * sizeof(float) -- AUX. -" \n\t" -" ldr x13,%[rs_c] \n\t" // Load rs_c. -" lsl x14,x13,#2 \n\t" // rs_c * sizeof(float). +" ldr x14,%[rs_c] \n\t" // Load rs_c. +" lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). " \n\t" " add x16,x2,x10 \n\t" //Load address Column 1 of C " add x17,x16,x10 \n\t" //Load address Column 2 of C @@ -502,10 +496,16 @@ BNE(SLOOPKLEFT) // if i!=0. " \n\t" LABEL(SPOSTACCUM) " \n\t" -" ld1r {v6.4s},[x7] \n\t" // Load alpha. -" ld1r {v7.4s},[x8] \n\t" // Load beta +" ldr x0,%[alpha] \n\t" // Alpha address. +" ldr x1,%[beta] \n\t" // Beta address. " \n\t" -" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" ld1r {v6.4s},[x0] \n\t" // Load alpha. +" ld1r {v7.4s},[x1] \n\t" // Load beta +" \n\t" +" ldr x0,%[a_next] \n\t" // Pointer to next block of A. +" ldr x1,%[b_next] \n\t" // Pointer to next pointer of B. +" \n\t" +" cmp x14,#4 \n\t" // If rs_c != 1 (column-major) BNE(SGENSTORED) " \n\t" LABEL(SCOLSTORED) // C is column-major. 
@@ -656,8 +656,8 @@ BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 " \n\t" LABEL(SBETAZEROCOLSTOREDS4) " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[x0] \n\t" +" prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha @@ -1008,8 +1008,8 @@ BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 " \n\t" LABEL(SBETAZEROGENSTOREDS4) " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[x0] \n\t" +" prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha @@ -1067,10 +1067,9 @@ LABEL(SEND) // Done! [a_next] "m" (a_next), // 9 [b_next] "m" (b_next) // 10 :// Register clobber list - "x0", "x1", "x2","x3","x4", - "x5", "x6", "x7", "x8", - "x9", "x10","x11","x12", - "x13","x14","x15", + "x0", "x1", "x2", + "x5", "x6", "x10", + "x14", "x16","x17","x18","x19", "x20","x21","x22","x23", "x24","x25","x26","x27", From 5fc93e280614b4a21a9cff36cf873b4b9407285b Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 29 May 2021 18:44:47 +0900 Subject: [PATCH 4/4] Armv8A Rename Regs for Safe Darwin Compile Avoid x18 use in FP32 kernel: - C address lines x[18-26] renamed to x[19-27] (reg index +1) - The original role of x27 is taken over by x5, which is free after the k-loop part. FP64 does not require changing since x18 is not used there.
--- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 533 ++++++++++---------- 1 file changed, 266 insertions(+), 267 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index be5e20ae7..dfdda863b 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -93,20 +93,19 @@ __asm__ volatile " \n\t" " add x16,x2,x10 \n\t" //Load address Column 1 of C " add x17,x16,x10 \n\t" //Load address Column 2 of C -" add x18,x17,x10 \n\t" //Load address Column 3 of C -" add x19,x18,x10 \n\t" //Load address Column 4 of C -" add x20,x19,x10 \n\t" //Load address Column 5 of C -" add x21,x20,x10 \n\t" //Load address Column 6 of C -" add x22,x21,x10 \n\t" //Load address Column 7 of C -" add x23,x22,x10 \n\t" //Load address Column 8 of C -" add x24,x23,x10 \n\t" //Load address Column 9 of C -" add x25,x24,x10 \n\t" //Load address Column 10 of C -" add x26,x25,x10 \n\t" //Load address Column 11 of C +" add x19,x17,x10 \n\t" //Load address Column 3 of C +" add x20,x19,x10 \n\t" //Load address Column 4 of C +" add x21,x20,x10 \n\t" //Load address Column 5 of C +" add x22,x21,x10 \n\t" //Load address Column 6 of C +" add x23,x22,x10 \n\t" //Load address Column 7 of C +" add x24,x23,x10 \n\t" //Load address Column 8 of C +" add x25,x24,x10 \n\t" //Load address Column 9 of C +" add x26,x25,x10 \n\t" //Load address Column 10 of C +" add x27,x26,x10 \n\t" //Load address Column 11 of C " \n\t" " prfm pldl1keep,[x2] \n\t" // Prefetch c. " prfm pldl1keep,[x16] \n\t" // Prefetch c. " prfm pldl1keep,[x17] \n\t" // Prefetch c. -" prfm pldl1keep,[x18] \n\t" // Prefetch c. " prfm pldl1keep,[x19] \n\t" // Prefetch c. " prfm pldl1keep,[x20] \n\t" // Prefetch c. " prfm pldl1keep,[x21] \n\t" // Prefetch c. @@ -115,6 +114,7 @@ __asm__ volatile " prfm pldl1keep,[x24] \n\t" // Prefetch c. " prfm pldl1keep,[x25] \n\t" // Prefetch c. " prfm pldl1keep,[x26] \n\t" // Prefetch c. 
+" prfm pldl1keep,[x27] \n\t" // Prefetch c. " \n\t" " dup v8.4s, wzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #192] \n\t" @@ -560,12 +560,12 @@ LABEL(SBETAZEROCOLSTOREDS1) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. " \n\t" -" ldr q8, [x18] \n\t" //Load column 3 of C -" ldr q9, [x18, #16] \n\t" -" ldr q10, [x19] \n\t" //Load column 4 of C -" ldr q11, [x19, #16] \n\t" -" ldr q12, [x20] \n\t" //Load column 5 of C -" ldr q13, [x20, #16] \n\t" +" ldr q8, [x19] \n\t" //Load column 3 of C +" ldr q9, [x19, #16] \n\t" +" ldr q10, [x20] \n\t" //Load column 4 of C +" ldr q11, [x20, #16] \n\t" +" ldr q12, [x21] \n\t" //Load column 5 of C +" ldr q13, [x21, #16] \n\t" " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta @@ -583,12 +583,12 @@ LABEL(SBETAZEROCOLSTOREDS2) " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" str q8, [x18] \n\t" //Store column 3 of C -" str q9, [x18, #16] \n\t" -" str q10, [x19] \n\t" //Store column 4 of C -" str q11, [x19, #16] \n\t" -" str q12, [x20] \n\t" //Store column 5 of C -" str q13, [x20, #16] \n\t" +" str q8, [x19] \n\t" //Store column 3 of C +" str q9, [x19, #16] \n\t" +" str q10, [x20] \n\t" //Store column 4 of C +" str q11, [x20, #16] \n\t" +" str q12, [x21] \n\t" //Store column 5 of C +" str q13, [x21, #16] \n\t" " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" @@ -600,12 +600,12 @@ LABEL(SBETAZEROCOLSTOREDS2) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
" \n\t" -" ldr q0, [x21] \n\t" //Load column 6 of C -" ldr q1, [x21, #16] \n\t" -" ldr q2, [x22] \n\t" //Load column 7 of C -" ldr q3, [x22, #16] \n\t" -" ldr q4, [x23] \n\t" //Load column 8 of C -" ldr q5, [x23, #16] \n\t" +" ldr q0, [x22] \n\t" //Load column 6 of C +" ldr q1, [x22, #16] \n\t" +" ldr q2, [x23] \n\t" //Load column 7 of C +" ldr q3, [x23, #16] \n\t" +" ldr q4, [x24] \n\t" //Load column 8 of C +" ldr q5, [x24, #16] \n\t" " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta @@ -623,12 +623,12 @@ LABEL(SBETAZEROCOLSTOREDS3) " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" str q0, [x21] \n\t" //Store column 6 of C -" str q1, [x21, #16] \n\t" -" str q2, [x22] \n\t" //Store column 7 of C -" str q3, [x22, #16] \n\t" -" str q4, [x23] \n\t" //Store column 8 of C -" str q5, [x23, #16] \n\t" +" str q0, [x22] \n\t" //Store column 6 of C +" str q1, [x22, #16] \n\t" +" str q2, [x23] \n\t" //Store column 7 of C +" str q3, [x23, #16] \n\t" +" str q4, [x24] \n\t" //Store column 8 of C +" str q5, [x24, #16] \n\t" " \n\t" " dup v8.4s, wzr \n\t" " dup v9.4s, wzr \n\t" @@ -640,12 +640,12 @@ LABEL(SBETAZEROCOLSTOREDS3) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
" \n\t" -" ldr q8, [x24] \n\t" //Load column 9 of C -" ldr q9, [x24, #16] \n\t" -" ldr q10, [x25] \n\t" //Load column 10 of C -" ldr q11, [x25, #16] \n\t" -" ldr q12, [x26] \n\t" //Load column 11 of C -" ldr q13, [x26, #16] \n\t" +" ldr q8, [x25] \n\t" //Load column 9 of C +" ldr q9, [x25, #16] \n\t" +" ldr q10, [x26] \n\t" //Load column 10 of C +" ldr q11, [x26, #16] \n\t" +" ldr q12, [x27] \n\t" //Load column 11 of C +" ldr q13, [x27, #16] \n\t" " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta @@ -666,12 +666,12 @@ LABEL(SBETAZEROCOLSTOREDS4) " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" str q8, [x24] \n\t" //Store column 9 of C -" str q9, [x24, #16] \n\t" -" str q10, [x25] \n\t" //Store column 10 of C -" str q11, [x25, #16] \n\t" -" str q12, [x26] \n\t" //Store column 11 of C -" str q13, [x26, #16] \n\t" +" str q8, [x25] \n\t" //Store column 9 of C +" str q9, [x25, #16] \n\t" +" str q10, [x26] \n\t" //Store column 10 of C +" str q11, [x26, #16] \n\t" +" str q12, [x27] \n\t" //Store column 11 of C +" str q13, [x27, #16] \n\t" " \n\t" " \n\t" BRANCH(SEND) // Done. @@ -690,38 +690,38 @@ LABEL(SGENSTORED) // C is general-stride stored " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. " \n\t" -" mov x27, x2 \n\t" +" mov x5, x2 \n\t" " \n\t" -" ld1 {v0.s}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. -" ld1 {v0.s}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. -" ld1 {v0.s}[2],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. -" ld1 {v0.s}[3],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. -" ld1 {v1.s}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. -" ld1 {v1.s}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. -" ld1 {v1.s}[2],[x27],x14 \n\t" // Load c06 into quad and increment by rs_c. 
-" ld1 {v1.s}[3],[x27],x14 \n\t" // Load c07 into quad and increment by rs_c. +" ld1 {v0.s}[0],[x5],x14 \n\t" // Load c00 into quad and increment by rs_c. +" ld1 {v0.s}[1],[x5],x14 \n\t" // Load c01 into quad and increment by rs_c. +" ld1 {v0.s}[2],[x5],x14 \n\t" // Load c02 into quad and increment by rs_c. +" ld1 {v0.s}[3],[x5],x14 \n\t" // Load c03 into quad and increment by rs_c. +" ld1 {v1.s}[0],[x5],x14 \n\t" // Load c04 into quad and increment by rs_c. +" ld1 {v1.s}[1],[x5],x14 \n\t" // Load c05 into quad and increment by rs_c. +" ld1 {v1.s}[2],[x5],x14 \n\t" // Load c06 into quad and increment by rs_c. +" ld1 {v1.s}[3],[x5],x14 \n\t" // Load c07 into quad and increment by rs_c. " \n\t" -" mov x27, x16 \n\t" +" mov x5, x16 \n\t" " \n\t" -" ld1 {v2.s}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v2.s}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v2.s}[2],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v2.s}[3],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. -" ld1 {v3.s}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. -" ld1 {v3.s}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. -" ld1 {v3.s}[2],[x27],x14 \n\t" // Load c16 into quad and increment by rs_c. -" ld1 {v3.s}[3],[x27],x14 \n\t" // Load c17 into quad and increment by rs_c. +" ld1 {v2.s}[0],[x5],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v2.s}[1],[x5],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v2.s}[2],[x5],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v2.s}[3],[x5],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v3.s}[0],[x5],x14 \n\t" // Load c14 into quad and increment by rs_c. +" ld1 {v3.s}[1],[x5],x14 \n\t" // Load c15 into quad and increment by rs_c. +" ld1 {v3.s}[2],[x5],x14 \n\t" // Load c16 into quad and increment by rs_c. +" ld1 {v3.s}[3],[x5],x14 \n\t" // Load c17 into quad and increment by rs_c. 
" \n\t" -" mov x27, x17 \n\t" +" mov x5, x17 \n\t" " \n\t" -" ld1 {v4.s}[0],[x27],x14 \n\t" // Load c20 into quad and increment by rs_c. -" ld1 {v4.s}[1],[x27],x14 \n\t" // Load c21 into quad and increment by rs_c. -" ld1 {v4.s}[2],[x27],x14 \n\t" // Load c22 into quad and increment by rs_c. -" ld1 {v4.s}[3],[x27],x14 \n\t" // Load c23 into quad and increment by rs_c. -" ld1 {v5.s}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. -" ld1 {v5.s}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. -" ld1 {v5.s}[2],[x27],x14 \n\t" // Load c26 into quad and increment by rs_c. -" ld1 {v5.s}[3],[x27],x14 \n\t" // Load c27 into quad and increment by rs_c. +" ld1 {v4.s}[0],[x5],x14 \n\t" // Load c20 into quad and increment by rs_c. +" ld1 {v4.s}[1],[x5],x14 \n\t" // Load c21 into quad and increment by rs_c. +" ld1 {v4.s}[2],[x5],x14 \n\t" // Load c22 into quad and increment by rs_c. +" ld1 {v4.s}[3],[x5],x14 \n\t" // Load c23 into quad and increment by rs_c. +" ld1 {v5.s}[0],[x5],x14 \n\t" // Load c24 into quad and increment by rs_c. +" ld1 {v5.s}[1],[x5],x14 \n\t" // Load c25 into quad and increment by rs_c. +" ld1 {v5.s}[2],[x5],x14 \n\t" // Load c26 into quad and increment by rs_c. +" ld1 {v5.s}[3],[x5],x14 \n\t" // Load c27 into quad and increment by rs_c. " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta @@ -739,38 +739,38 @@ LABEL(SBETAZEROGENSTOREDS1) " fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" mov x27, x2 \n\t" +" mov x5, x2 \n\t" " \n\t" -" st1 {v0.s}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. -" st1 {v0.s}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. -" st1 {v0.s}[2],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. -" st1 {v0.s}[3],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. 
-" st1 {v1.s}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. -" st1 {v1.s}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. -" st1 {v1.s}[2],[x27],x14 \n\t" // Store c06 into quad and increment by rs_c. -" st1 {v1.s}[3],[x27],x14 \n\t" // Store c07 into quad and increment by rs_c. +" st1 {v0.s}[0],[x5],x14 \n\t" // Store c00 into quad and increment by rs_c. +" st1 {v0.s}[1],[x5],x14 \n\t" // Store c01 into quad and increment by rs_c. +" st1 {v0.s}[2],[x5],x14 \n\t" // Store c02 into quad and increment by rs_c. +" st1 {v0.s}[3],[x5],x14 \n\t" // Store c03 into quad and increment by rs_c. +" st1 {v1.s}[0],[x5],x14 \n\t" // Store c04 into quad and increment by rs_c. +" st1 {v1.s}[1],[x5],x14 \n\t" // Store c05 into quad and increment by rs_c. +" st1 {v1.s}[2],[x5],x14 \n\t" // Store c06 into quad and increment by rs_c. +" st1 {v1.s}[3],[x5],x14 \n\t" // Store c07 into quad and increment by rs_c. " \n\t" -" mov x27, x16 \n\t" +" mov x5, x16 \n\t" " \n\t" -" st1 {v2.s}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v2.s}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v2.s}[2],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v2.s}[3],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. -" st1 {v3.s}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. -" st1 {v3.s}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. -" st1 {v3.s}[2],[x27],x14 \n\t" // Store c16 into quad and increment by rs_c. -" st1 {v3.s}[3],[x27],x14 \n\t" // Store c17 into quad and increment by rs_c. +" st1 {v2.s}[0],[x5],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v2.s}[1],[x5],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v2.s}[2],[x5],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v2.s}[3],[x5],x14 \n\t" // Store c13 into quad and increment by rs_c. 
+" st1 {v3.s}[0],[x5],x14 \n\t" // Store c14 into quad and increment by rs_c. +" st1 {v3.s}[1],[x5],x14 \n\t" // Store c15 into quad and increment by rs_c. +" st1 {v3.s}[2],[x5],x14 \n\t" // Store c16 into quad and increment by rs_c. +" st1 {v3.s}[3],[x5],x14 \n\t" // Store c17 into quad and increment by rs_c. " \n\t" -" mov x27, x17 \n\t" +" mov x5, x17 \n\t" " \n\t" -" st1 {v4.s}[0],[x27],x14 \n\t" // Store c20 into quad and increment by rs_c. -" st1 {v4.s}[1],[x27],x14 \n\t" // Store c21 into quad and increment by rs_c. -" st1 {v4.s}[2],[x27],x14 \n\t" // Store c22 into quad and increment by rs_c. -" st1 {v4.s}[3],[x27],x14 \n\t" // Store c23 into quad and increment by rs_c. -" st1 {v5.s}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. -" st1 {v5.s}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. -" st1 {v5.s}[2],[x27],x14 \n\t" // Store c26 into quad and increment by rs_c. -" st1 {v5.s}[3],[x27],x14 \n\t" // Store c27 into quad and increment by rs_c. +" st1 {v4.s}[0],[x5],x14 \n\t" // Store c20 into quad and increment by rs_c. +" st1 {v4.s}[1],[x5],x14 \n\t" // Store c21 into quad and increment by rs_c. +" st1 {v4.s}[2],[x5],x14 \n\t" // Store c22 into quad and increment by rs_c. +" st1 {v4.s}[3],[x5],x14 \n\t" // Store c23 into quad and increment by rs_c. +" st1 {v5.s}[0],[x5],x14 \n\t" // Store c24 into quad and increment by rs_c. +" st1 {v5.s}[1],[x5],x14 \n\t" // Store c25 into quad and increment by rs_c. +" st1 {v5.s}[2],[x5],x14 \n\t" // Store c26 into quad and increment by rs_c. +" st1 {v5.s}[3],[x5],x14 \n\t" // Store c27 into quad and increment by rs_c. " \n\t" " dup v8.4s, wzr \n\t" " dup v9.4s, wzr \n\t" @@ -782,38 +782,38 @@ LABEL(SBETAZEROGENSTOREDS1) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. " \n\t" -" mov x27, x18 \n\t" +" mov x5, x19 \n\t" " \n\t" -" ld1 {v8.s}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. 
-" ld1 {v8.s}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. -" ld1 {v8.s}[2],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. -" ld1 {v8.s}[3],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. -" ld1 {v9.s}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. -" ld1 {v9.s}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. -" ld1 {v9.s}[2],[x27],x14 \n\t" // Load c36 into quad and increment by rs_c. -" ld1 {v9.s}[3],[x27],x14 \n\t" // Load c37 into quad and increment by rs_c. +" ld1 {v8.s}[0],[x5],x14 \n\t" // Load c30 into quad and increment by rs_c. +" ld1 {v8.s}[1],[x5],x14 \n\t" // Load c31 into quad and increment by rs_c. +" ld1 {v8.s}[2],[x5],x14 \n\t" // Load c32 into quad and increment by rs_c. +" ld1 {v8.s}[3],[x5],x14 \n\t" // Load c33 into quad and increment by rs_c. +" ld1 {v9.s}[0],[x5],x14 \n\t" // Load c34 into quad and increment by rs_c. +" ld1 {v9.s}[1],[x5],x14 \n\t" // Load c35 into quad and increment by rs_c. +" ld1 {v9.s}[2],[x5],x14 \n\t" // Load c36 into quad and increment by rs_c. +" ld1 {v9.s}[3],[x5],x14 \n\t" // Load c37 into quad and increment by rs_c. " \n\t" -" mov x27, x19 \n\t" +" mov x5, x20 \n\t" " \n\t" -" ld1 {v10.s}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. -" ld1 {v10.s}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. -" ld1 {v10.s}[2],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. -" ld1 {v10.s}[3],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. -" ld1 {v11.s}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. -" ld1 {v11.s}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. -" ld1 {v11.s}[2],[x27],x14 \n\t" // Load c46 into quad and increment by rs_c. -" ld1 {v11.s}[3],[x27],x14 \n\t" // Load c47 into quad and increment by rs_c. +" ld1 {v10.s}[0],[x5],x14 \n\t" // Load c40 into quad and increment by rs_c. 
+" ld1 {v10.s}[1],[x5],x14 \n\t" // Load c41 into quad and increment by rs_c. +" ld1 {v10.s}[2],[x5],x14 \n\t" // Load c42 into quad and increment by rs_c. +" ld1 {v10.s}[3],[x5],x14 \n\t" // Load c43 into quad and increment by rs_c. +" ld1 {v11.s}[0],[x5],x14 \n\t" // Load c44 into quad and increment by rs_c. +" ld1 {v11.s}[1],[x5],x14 \n\t" // Load c45 into quad and increment by rs_c. +" ld1 {v11.s}[2],[x5],x14 \n\t" // Load c46 into quad and increment by rs_c. +" ld1 {v11.s}[3],[x5],x14 \n\t" // Load c47 into quad and increment by rs_c. " \n\t" -" mov x27, x20 \n\t" +" mov x5, x21 \n\t" " \n\t" -" ld1 {v12.s}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. -" ld1 {v12.s}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. -" ld1 {v12.s}[2],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. -" ld1 {v12.s}[3],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. -" ld1 {v13.s}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. -" ld1 {v13.s}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. -" ld1 {v13.s}[2],[x27],x14 \n\t" // Load c56 into quad and increment by rs_c. -" ld1 {v13.s}[3],[x27],x14 \n\t" // Load c57 into quad and increment by rs_c. +" ld1 {v12.s}[0],[x5],x14 \n\t" // Load c50 into quad and increment by rs_c. +" ld1 {v12.s}[1],[x5],x14 \n\t" // Load c51 into quad and increment by rs_c. +" ld1 {v12.s}[2],[x5],x14 \n\t" // Load c52 into quad and increment by rs_c. +" ld1 {v12.s}[3],[x5],x14 \n\t" // Load c53 into quad and increment by rs_c. +" ld1 {v13.s}[0],[x5],x14 \n\t" // Load c54 into quad and increment by rs_c. +" ld1 {v13.s}[1],[x5],x14 \n\t" // Load c55 into quad and increment by rs_c. +" ld1 {v13.s}[2],[x5],x14 \n\t" // Load c56 into quad and increment by rs_c. +" ld1 {v13.s}[3],[x5],x14 \n\t" // Load c57 into quad and increment by rs_c. 
" \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta @@ -831,38 +831,38 @@ LABEL(SBETAZEROGENSTOREDS2) " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" mov x27, x18 \n\t" +" mov x5, x19 \n\t" " \n\t" -" st1 {v8.s}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. -" st1 {v8.s}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. -" st1 {v8.s}[2],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. -" st1 {v8.s}[3],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. -" st1 {v9.s}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. -" st1 {v9.s}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. -" st1 {v9.s}[2],[x27],x14 \n\t" // Store c36 into quad and increment by rs_c. -" st1 {v9.s}[3],[x27],x14 \n\t" // Store c37 into quad and increment by rs_c. +" st1 {v8.s}[0],[x5],x14 \n\t" // Store c30 into quad and increment by rs_c. +" st1 {v8.s}[1],[x5],x14 \n\t" // Store c31 into quad and increment by rs_c. +" st1 {v8.s}[2],[x5],x14 \n\t" // Store c32 into quad and increment by rs_c. +" st1 {v8.s}[3],[x5],x14 \n\t" // Store c33 into quad and increment by rs_c. +" st1 {v9.s}[0],[x5],x14 \n\t" // Store c34 into quad and increment by rs_c. +" st1 {v9.s}[1],[x5],x14 \n\t" // Store c35 into quad and increment by rs_c. +" st1 {v9.s}[2],[x5],x14 \n\t" // Store c36 into quad and increment by rs_c. +" st1 {v9.s}[3],[x5],x14 \n\t" // Store c37 into quad and increment by rs_c. " \n\t" -" mov x27, x19 \n\t" +" mov x5, x20 \n\t" " \n\t" -" st1 {v10.s}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. -" st1 {v10.s}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. -" st1 {v10.s}[2],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. -" st1 {v10.s}[3],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. 
-" st1 {v11.s}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. -" st1 {v11.s}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. -" st1 {v11.s}[2],[x27],x14 \n\t" // Store c46 into quad and increment by rs_c. -" st1 {v11.s}[3],[x27],x14 \n\t" // Store c47 into quad and increment by rs_c. +" st1 {v10.s}[0],[x5],x14 \n\t" // Store c40 into quad and increment by rs_c. +" st1 {v10.s}[1],[x5],x14 \n\t" // Store c41 into quad and increment by rs_c. +" st1 {v10.s}[2],[x5],x14 \n\t" // Store c42 into quad and increment by rs_c. +" st1 {v10.s}[3],[x5],x14 \n\t" // Store c43 into quad and increment by rs_c. +" st1 {v11.s}[0],[x5],x14 \n\t" // Store c44 into quad and increment by rs_c. +" st1 {v11.s}[1],[x5],x14 \n\t" // Store c45 into quad and increment by rs_c. +" st1 {v11.s}[2],[x5],x14 \n\t" // Store c46 into quad and increment by rs_c. +" st1 {v11.s}[3],[x5],x14 \n\t" // Store c47 into quad and increment by rs_c. " \n\t" -" mov x27, x20 \n\t" +" mov x5, x21 \n\t" " \n\t" -" st1 {v12.s}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. -" st1 {v12.s}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. -" st1 {v12.s}[2],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. -" st1 {v12.s}[3],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. -" st1 {v13.s}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. -" st1 {v13.s}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. -" st1 {v13.s}[2],[x27],x14 \n\t" // Store c56 into quad and increment by rs_c. -" st1 {v13.s}[3],[x27],x14 \n\t" // Store c57 into quad and increment by rs_c. +" st1 {v12.s}[0],[x5],x14 \n\t" // Store c50 into quad and increment by rs_c. +" st1 {v12.s}[1],[x5],x14 \n\t" // Store c51 into quad and increment by rs_c. +" st1 {v12.s}[2],[x5],x14 \n\t" // Store c52 into quad and increment by rs_c. +" st1 {v12.s}[3],[x5],x14 \n\t" // Store c53 into quad and increment by rs_c. 
+" st1 {v13.s}[0],[x5],x14 \n\t" // Store c54 into quad and increment by rs_c. +" st1 {v13.s}[1],[x5],x14 \n\t" // Store c55 into quad and increment by rs_c. +" st1 {v13.s}[2],[x5],x14 \n\t" // Store c56 into quad and increment by rs_c. +" st1 {v13.s}[3],[x5],x14 \n\t" // Store c57 into quad and increment by rs_c. " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" @@ -874,38 +874,38 @@ LABEL(SBETAZEROGENSTOREDS2) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. " \n\t" -" mov x27, x21 \n\t" +" mov x5, x22 \n\t" " \n\t" -" ld1 {v0.s}[0],[x27],x14 \n\t" // Load c60 into quad and increment by rs_c. -" ld1 {v0.s}[1],[x27],x14 \n\t" // Load c61 into quad and increment by rs_c. -" ld1 {v0.s}[2],[x27],x14 \n\t" // Load c62 into quad and increment by rs_c. -" ld1 {v0.s}[3],[x27],x14 \n\t" // Load c63 into quad and increment by rs_c. -" ld1 {v1.s}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. -" ld1 {v1.s}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. -" ld1 {v1.s}[2],[x27],x14 \n\t" // Load c66 into quad and increment by rs_c. -" ld1 {v1.s}[3],[x27],x14 \n\t" // Load c67 into quad and increment by rs_c. +" ld1 {v0.s}[0],[x5],x14 \n\t" // Load c60 into quad and increment by rs_c. +" ld1 {v0.s}[1],[x5],x14 \n\t" // Load c61 into quad and increment by rs_c. +" ld1 {v0.s}[2],[x5],x14 \n\t" // Load c62 into quad and increment by rs_c. +" ld1 {v0.s}[3],[x5],x14 \n\t" // Load c63 into quad and increment by rs_c. +" ld1 {v1.s}[0],[x5],x14 \n\t" // Load c64 into quad and increment by rs_c. +" ld1 {v1.s}[1],[x5],x14 \n\t" // Load c65 into quad and increment by rs_c. +" ld1 {v1.s}[2],[x5],x14 \n\t" // Load c66 into quad and increment by rs_c. +" ld1 {v1.s}[3],[x5],x14 \n\t" // Load c67 into quad and increment by rs_c. " \n\t" -" mov x27, x22 \n\t" +" mov x5, x23 \n\t" " \n\t" -" ld1 {v2.s}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. 
-" ld1 {v2.s}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. -" ld1 {v2.s}[2],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. -" ld1 {v2.s}[3],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. -" ld1 {v3.s}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. -" ld1 {v3.s}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. -" ld1 {v3.s}[2],[x27],x14 \n\t" // Load c76 into quad and increment by rs_c. -" ld1 {v3.s}[3],[x27],x14 \n\t" // Load c77 into quad and increment by rs_c. +" ld1 {v2.s}[0],[x5],x14 \n\t" // Load c70 into quad and increment by rs_c. +" ld1 {v2.s}[1],[x5],x14 \n\t" // Load c71 into quad and increment by rs_c. +" ld1 {v2.s}[2],[x5],x14 \n\t" // Load c72 into quad and increment by rs_c. +" ld1 {v2.s}[3],[x5],x14 \n\t" // Load c73 into quad and increment by rs_c. +" ld1 {v3.s}[0],[x5],x14 \n\t" // Load c74 into quad and increment by rs_c. +" ld1 {v3.s}[1],[x5],x14 \n\t" // Load c75 into quad and increment by rs_c. +" ld1 {v3.s}[2],[x5],x14 \n\t" // Load c76 into quad and increment by rs_c. +" ld1 {v3.s}[3],[x5],x14 \n\t" // Load c77 into quad and increment by rs_c. " \n\t" -" mov x27, x23 \n\t" +" mov x5, x24 \n\t" " \n\t" -" ld1 {v4.s}[0],[x27],x14 \n\t" // Load c80 into quad and increment by rs_c. -" ld1 {v4.s}[1],[x27],x14 \n\t" // Load c81 into quad and increment by rs_c. -" ld1 {v4.s}[2],[x27],x14 \n\t" // Load c82 into quad and increment by rs_c. -" ld1 {v4.s}[3],[x27],x14 \n\t" // Load c83 into quad and increment by rs_c. -" ld1 {v5.s}[0],[x27],x14 \n\t" // Load c84 into quad and increment by rs_c. -" ld1 {v5.s}[1],[x27],x14 \n\t" // Load c85 into quad and increment by rs_c. -" ld1 {v5.s}[2],[x27],x14 \n\t" // Load c86 into quad and increment by rs_c. -" ld1 {v5.s}[3],[x27],x14 \n\t" // Load c87 into quad and increment by rs_c. +" ld1 {v4.s}[0],[x5],x14 \n\t" // Load c80 into quad and increment by rs_c. +" ld1 {v4.s}[1],[x5],x14 \n\t" // Load c81 into quad and increment by rs_c. 
+" ld1 {v4.s}[2],[x5],x14 \n\t" // Load c82 into quad and increment by rs_c. +" ld1 {v4.s}[3],[x5],x14 \n\t" // Load c83 into quad and increment by rs_c. +" ld1 {v5.s}[0],[x5],x14 \n\t" // Load c84 into quad and increment by rs_c. +" ld1 {v5.s}[1],[x5],x14 \n\t" // Load c85 into quad and increment by rs_c. +" ld1 {v5.s}[2],[x5],x14 \n\t" // Load c86 into quad and increment by rs_c. +" ld1 {v5.s}[3],[x5],x14 \n\t" // Load c87 into quad and increment by rs_c. " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta @@ -923,38 +923,38 @@ LABEL(SBETAZEROGENSTOREDS3) " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" mov x27, x21 \n\t" +" mov x5, x22 \n\t" " \n\t" -" st1 {v0.s}[0],[x27],x14 \n\t" // Store c60 into quad and increment by rs_c. -" st1 {v0.s}[1],[x27],x14 \n\t" // Store c61 into quad and increment by rs_c. -" st1 {v0.s}[2],[x27],x14 \n\t" // Store c62 into quad and increment by rs_c. -" st1 {v0.s}[3],[x27],x14 \n\t" // Store c63 into quad and increment by rs_c. -" st1 {v1.s}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. -" st1 {v1.s}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. -" st1 {v1.s}[2],[x27],x14 \n\t" // Store c66 into quad and increment by rs_c. -" st1 {v1.s}[3],[x27],x14 \n\t" // Store c67 into quad and increment by rs_c. +" st1 {v0.s}[0],[x5],x14 \n\t" // Store c60 into quad and increment by rs_c. +" st1 {v0.s}[1],[x5],x14 \n\t" // Store c61 into quad and increment by rs_c. +" st1 {v0.s}[2],[x5],x14 \n\t" // Store c62 into quad and increment by rs_c. +" st1 {v0.s}[3],[x5],x14 \n\t" // Store c63 into quad and increment by rs_c. +" st1 {v1.s}[0],[x5],x14 \n\t" // Store c64 into quad and increment by rs_c. +" st1 {v1.s}[1],[x5],x14 \n\t" // Store c65 into quad and increment by rs_c. +" st1 {v1.s}[2],[x5],x14 \n\t" // Store c66 into quad and increment by rs_c. 
+" st1 {v1.s}[3],[x5],x14 \n\t" // Store c67 into quad and increment by rs_c. " \n\t" -" mov x27, x22 \n\t" +" mov x5, x23 \n\t" " \n\t" -" st1 {v2.s}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. -" st1 {v2.s}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. -" st1 {v2.s}[2],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. -" st1 {v2.s}[3],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. -" st1 {v3.s}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. -" st1 {v3.s}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. -" st1 {v3.s}[2],[x27],x14 \n\t" // Store c76 into quad and increment by rs_c. -" st1 {v3.s}[3],[x27],x14 \n\t" // Store c77 into quad and increment by rs_c. +" st1 {v2.s}[0],[x5],x14 \n\t" // Store c70 into quad and increment by rs_c. +" st1 {v2.s}[1],[x5],x14 \n\t" // Store c71 into quad and increment by rs_c. +" st1 {v2.s}[2],[x5],x14 \n\t" // Store c72 into quad and increment by rs_c. +" st1 {v2.s}[3],[x5],x14 \n\t" // Store c73 into quad and increment by rs_c. +" st1 {v3.s}[0],[x5],x14 \n\t" // Store c74 into quad and increment by rs_c. +" st1 {v3.s}[1],[x5],x14 \n\t" // Store c75 into quad and increment by rs_c. +" st1 {v3.s}[2],[x5],x14 \n\t" // Store c76 into quad and increment by rs_c. +" st1 {v3.s}[3],[x5],x14 \n\t" // Store c77 into quad and increment by rs_c. " \n\t" -" mov x27, x23 \n\t" +" mov x5, x24 \n\t" " \n\t" -" st1 {v4.s}[0],[x27],x14 \n\t" // Store c80 into quad and increment by rs_c. -" st1 {v4.s}[1],[x27],x14 \n\t" // Store c81 into quad and increment by rs_c. -" st1 {v4.s}[2],[x27],x14 \n\t" // Store c82 into quad and increment by rs_c. -" st1 {v4.s}[3],[x27],x14 \n\t" // Store c83 into quad and increment by rs_c. -" st1 {v5.s}[0],[x27],x14 \n\t" // Store c84 into quad and increment by rs_c. -" st1 {v5.s}[1],[x27],x14 \n\t" // Store c85 into quad and increment by rs_c. 
-" st1 {v5.s}[2],[x27],x14 \n\t" // Store c86 into quad and increment by rs_c. -" st1 {v5.s}[3],[x27],x14 \n\t" // Store c87 into quad and increment by rs_c. +" st1 {v4.s}[0],[x5],x14 \n\t" // Store c80 into quad and increment by rs_c. +" st1 {v4.s}[1],[x5],x14 \n\t" // Store c81 into quad and increment by rs_c. +" st1 {v4.s}[2],[x5],x14 \n\t" // Store c82 into quad and increment by rs_c. +" st1 {v4.s}[3],[x5],x14 \n\t" // Store c83 into quad and increment by rs_c. +" st1 {v5.s}[0],[x5],x14 \n\t" // Store c84 into quad and increment by rs_c. +" st1 {v5.s}[1],[x5],x14 \n\t" // Store c85 into quad and increment by rs_c. +" st1 {v5.s}[2],[x5],x14 \n\t" // Store c86 into quad and increment by rs_c. +" st1 {v5.s}[3],[x5],x14 \n\t" // Store c87 into quad and increment by rs_c. " \n\t" " dup v8.4s, wzr \n\t" " dup v9.4s, wzr \n\t" @@ -966,38 +966,38 @@ LABEL(SBETAZEROGENSTOREDS3) " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. " \n\t" -" mov x27, x24 \n\t" +" mov x5, x25 \n\t" " \n\t" -" ld1 {v8.s}[0],[x27],x14 \n\t" // Load c90 into quad and increment by rs_c. -" ld1 {v8.s}[1],[x27],x14 \n\t" // Load c91 into quad and increment by rs_c. -" ld1 {v8.s}[2],[x27],x14 \n\t" // Load c92 into quad and increment by rs_c. -" ld1 {v8.s}[3],[x27],x14 \n\t" // Load c93 into quad and increment by rs_c. -" ld1 {v9.s}[0],[x27],x14 \n\t" // Load c94 into quad and increment by rs_c. -" ld1 {v9.s}[1],[x27],x14 \n\t" // Load c95 into quad and increment by rs_c. -" ld1 {v9.s}[2],[x27],x14 \n\t" // Load c96 into quad and increment by rs_c. -" ld1 {v9.s}[3],[x27],x14 \n\t" // Load c97 into quad and increment by rs_c. +" ld1 {v8.s}[0],[x5],x14 \n\t" // Load c90 into quad and increment by rs_c. +" ld1 {v8.s}[1],[x5],x14 \n\t" // Load c91 into quad and increment by rs_c. +" ld1 {v8.s}[2],[x5],x14 \n\t" // Load c92 into quad and increment by rs_c. +" ld1 {v8.s}[3],[x5],x14 \n\t" // Load c93 into quad and increment by rs_c. 
+" ld1 {v9.s}[0],[x5],x14 \n\t" // Load c94 into quad and increment by rs_c. +" ld1 {v9.s}[1],[x5],x14 \n\t" // Load c95 into quad and increment by rs_c. +" ld1 {v9.s}[2],[x5],x14 \n\t" // Load c96 into quad and increment by rs_c. +" ld1 {v9.s}[3],[x5],x14 \n\t" // Load c97 into quad and increment by rs_c. " \n\t" -" mov x27, x25 \n\t" +" mov x5, x26 \n\t" " \n\t" -" ld1 {v10.s}[0],[x27],x14 \n\t" // Load c100 into quad and increment by rs_c. -" ld1 {v10.s}[1],[x27],x14 \n\t" // Load c101 into quad and increment by rs_c. -" ld1 {v10.s}[2],[x27],x14 \n\t" // Load c102 into quad and increment by rs_c. -" ld1 {v10.s}[3],[x27],x14 \n\t" // Load c103 into quad and increment by rs_c. -" ld1 {v11.s}[0],[x27],x14 \n\t" // Load c104 into quad and increment by rs_c. -" ld1 {v11.s}[1],[x27],x14 \n\t" // Load c105 into quad and increment by rs_c. -" ld1 {v11.s}[2],[x27],x14 \n\t" // Load c106 into quad and increment by rs_c. -" ld1 {v11.s}[3],[x27],x14 \n\t" // Load c107 into quad and increment by rs_c. +" ld1 {v10.s}[0],[x5],x14 \n\t" // Load c100 into quad and increment by rs_c. +" ld1 {v10.s}[1],[x5],x14 \n\t" // Load c101 into quad and increment by rs_c. +" ld1 {v10.s}[2],[x5],x14 \n\t" // Load c102 into quad and increment by rs_c. +" ld1 {v10.s}[3],[x5],x14 \n\t" // Load c103 into quad and increment by rs_c. +" ld1 {v11.s}[0],[x5],x14 \n\t" // Load c104 into quad and increment by rs_c. +" ld1 {v11.s}[1],[x5],x14 \n\t" // Load c105 into quad and increment by rs_c. +" ld1 {v11.s}[2],[x5],x14 \n\t" // Load c106 into quad and increment by rs_c. +" ld1 {v11.s}[3],[x5],x14 \n\t" // Load c107 into quad and increment by rs_c. " \n\t" -" mov x27, x26 \n\t" +" mov x5, x27 \n\t" " \n\t" -" ld1 {v12.s}[0],[x27],x14 \n\t" // Load c110 into quad and increment by rs_c. -" ld1 {v12.s}[1],[x27],x14 \n\t" // Load c111 into quad and increment by rs_c. -" ld1 {v12.s}[2],[x27],x14 \n\t" // Load c112 into quad and increment by rs_c. 
-" ld1 {v12.s}[3],[x27],x14 \n\t" // Load c113 into quad and increment by rs_c. -" ld1 {v13.s}[0],[x27],x14 \n\t" // Load c114 into quad and increment by rs_c. -" ld1 {v13.s}[1],[x27],x14 \n\t" // Load c115 into quad and increment by rs_c. -" ld1 {v13.s}[2],[x27],x14 \n\t" // Load c116 into quad and increment by rs_c. -" ld1 {v13.s}[3],[x27],x14 \n\t" // Load c117 into quad and increment by rs_c. +" ld1 {v12.s}[0],[x5],x14 \n\t" // Load c110 into quad and increment by rs_c. +" ld1 {v12.s}[1],[x5],x14 \n\t" // Load c111 into quad and increment by rs_c. +" ld1 {v12.s}[2],[x5],x14 \n\t" // Load c112 into quad and increment by rs_c. +" ld1 {v12.s}[3],[x5],x14 \n\t" // Load c113 into quad and increment by rs_c. +" ld1 {v13.s}[0],[x5],x14 \n\t" // Load c114 into quad and increment by rs_c. +" ld1 {v13.s}[1],[x5],x14 \n\t" // Load c115 into quad and increment by rs_c. +" ld1 {v13.s}[2],[x5],x14 \n\t" // Load c116 into quad and increment by rs_c. +" ld1 {v13.s}[3],[x5],x14 \n\t" // Load c117 into quad and increment by rs_c. " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta @@ -1018,38 +1018,38 @@ LABEL(SBETAZEROGENSTOREDS4) " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" mov x27, x24 \n\t" +" mov x5, x25 \n\t" " \n\t" -" st1 {v8.s}[0],[x27],x14 \n\t" // Store c90 into quad and increment by rs_c. -" st1 {v8.s}[1],[x27],x14 \n\t" // Store c91 into quad and increment by rs_c. -" st1 {v8.s}[2],[x27],x14 \n\t" // Store c92 into quad and increment by rs_c. -" st1 {v8.s}[3],[x27],x14 \n\t" // Store c93 into quad and increment by rs_c. -" st1 {v9.s}[0],[x27],x14 \n\t" // Store c94 into quad and increment by rs_c. -" st1 {v9.s}[1],[x27],x14 \n\t" // Store c95 into quad and increment by rs_c. -" st1 {v9.s}[2],[x27],x14 \n\t" // Store c96 into quad and increment by rs_c. -" st1 {v9.s}[3],[x27],x14 \n\t" // Store c97 into quad and increment by rs_c. 
+" st1 {v8.s}[0],[x5],x14 \n\t" // Store c90 into quad and increment by rs_c. +" st1 {v8.s}[1],[x5],x14 \n\t" // Store c91 into quad and increment by rs_c. +" st1 {v8.s}[2],[x5],x14 \n\t" // Store c92 into quad and increment by rs_c. +" st1 {v8.s}[3],[x5],x14 \n\t" // Store c93 into quad and increment by rs_c. +" st1 {v9.s}[0],[x5],x14 \n\t" // Store c94 into quad and increment by rs_c. +" st1 {v9.s}[1],[x5],x14 \n\t" // Store c95 into quad and increment by rs_c. +" st1 {v9.s}[2],[x5],x14 \n\t" // Store c96 into quad and increment by rs_c. +" st1 {v9.s}[3],[x5],x14 \n\t" // Store c97 into quad and increment by rs_c. " \n\t" -" mov x27, x25 \n\t" +" mov x5, x26 \n\t" " \n\t" -" st1 {v10.s}[0],[x27],x14 \n\t" // Store c100 into quad and increment by rs_c. -" st1 {v10.s}[1],[x27],x14 \n\t" // Store c101 into quad and increment by rs_c. -" st1 {v10.s}[2],[x27],x14 \n\t" // Store c102 into quad and increment by rs_c. -" st1 {v10.s}[3],[x27],x14 \n\t" // Store c103 into quad and increment by rs_c. -" st1 {v11.s}[0],[x27],x14 \n\t" // Store c104 into quad and increment by rs_c. -" st1 {v11.s}[1],[x27],x14 \n\t" // Store c105 into quad and increment by rs_c. -" st1 {v11.s}[2],[x27],x14 \n\t" // Store c106 into quad and increment by rs_c. -" st1 {v11.s}[3],[x27],x14 \n\t" // Store c107 into quad and increment by rs_c. +" st1 {v10.s}[0],[x5],x14 \n\t" // Store c100 into quad and increment by rs_c. +" st1 {v10.s}[1],[x5],x14 \n\t" // Store c101 into quad and increment by rs_c. +" st1 {v10.s}[2],[x5],x14 \n\t" // Store c102 into quad and increment by rs_c. +" st1 {v10.s}[3],[x5],x14 \n\t" // Store c103 into quad and increment by rs_c. +" st1 {v11.s}[0],[x5],x14 \n\t" // Store c104 into quad and increment by rs_c. +" st1 {v11.s}[1],[x5],x14 \n\t" // Store c105 into quad and increment by rs_c. +" st1 {v11.s}[2],[x5],x14 \n\t" // Store c106 into quad and increment by rs_c. +" st1 {v11.s}[3],[x5],x14 \n\t" // Store c107 into quad and increment by rs_c. 
" \n\t" -" mov x27, x26 \n\t" +" mov x5, x27 \n\t" " \n\t" -" st1 {v12.s}[0],[x27],x14 \n\t" // Store c110 into quad and increment by rs_c. -" st1 {v12.s}[1],[x27],x14 \n\t" // Store c111 into quad and increment by rs_c. -" st1 {v12.s}[2],[x27],x14 \n\t" // Store c112 into quad and increment by rs_c. -" st1 {v12.s}[3],[x27],x14 \n\t" // Store c113 into quad and increment by rs_c. -" st1 {v13.s}[0],[x27],x14 \n\t" // Store c114 into quad and increment by rs_c. -" st1 {v13.s}[1],[x27],x14 \n\t" // Store c115 into quad and increment by rs_c. -" st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c. -" st1 {v13.s}[3],[x27],x14 \n\t" // Store c147 into quad and increment by rs_c. +" st1 {v12.s}[0],[x5],x14 \n\t" // Store c110 into quad and increment by rs_c. +" st1 {v12.s}[1],[x5],x14 \n\t" // Store c111 into quad and increment by rs_c. +" st1 {v12.s}[2],[x5],x14 \n\t" // Store c112 into quad and increment by rs_c. +" st1 {v12.s}[3],[x5],x14 \n\t" // Store c113 into quad and increment by rs_c. +" st1 {v13.s}[0],[x5],x14 \n\t" // Store c114 into quad and increment by rs_c. +" st1 {v13.s}[1],[x5],x14 \n\t" // Store c115 into quad and increment by rs_c. +" st1 {v13.s}[2],[x5],x14 \n\t" // Store c116 into quad and increment by rs_c. +" st1 {v13.s}[3],[x5],x14 \n\t" // Store c147 into quad and increment by rs_c. " \n\t" LABEL(SEND) // Done! " \n\t" @@ -1068,11 +1068,10 @@ LABEL(SEND) // Done! [b_next] "m" (b_next) // 10 :// Register clobber list "x0", "x1", "x2", - "x5", "x6", "x10", - "x14", - "x16","x17","x18","x19", - "x20","x21","x22","x23", - "x24","x25","x26","x27", + "x5", "x6", "x10","x14", + "x16","x17","x19","x20", + "x21","x22","x23","x24", + "x25","x26","x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11",