diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h new file mode 100644 index 000000000..7bf97d555 --- /dev/null +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +// Apple's local label requirements. 
+#if defined(__APPLE__) /* Label and branch helpers for the inline-assembly kernels. Each macro stringizes its bare (unquoted) label name argument and expands to one assembly line terminated by newline-tab, so it can sit between the string literals of an asm template. On Apple targets the label name is prefixed with L. */ +#define LABEL(str) " L" #str": \n\t" /* Defines the label L<str>. */ +#define BEQ(str) "b.eq L" #str" \n\t" /* Emits b.eq with L<str> as the branch target. */ +#define BNE(str) "b.ne L" #str" \n\t" /* Emits b.ne with L<str> as the branch target. */ +#define BRANCH(str) "b L" #str" \n\t" /* Emits an unconditional b to L<str>. */ +#else /* Every other target: the same four helpers, but the label name is prefixed with a dot. */ +#define LABEL(str) " ." #str": \n\t" /* Defines the label .<str>. */ +#define BEQ(str) "b.eq ." #str" \n\t" /* Emits b.eq with .<str> as the branch target. */ +#define BNE(str) "b.ne ." #str" \n\t" /* Emits b.ne with .<str> as the branch target. */ +#define BRANCH(str) "b ." #str" \n\t" /* Emits an unconditional b to .<str>. */ +#endif /* End of the platform-specific label helpers. */ + diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index c01c67f5a..251931f7c 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -34,6 +34,7 @@ */ #include "blis.h" +#include "armv8a_asm_utils.h" /* o 4x4 Single precision micro-kernel fully functional. @@ -155,7 +156,7 @@ __asm__ volatile " dup v31.4s, wzr \n\t" // Vector for accummulating column 11 " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. -" beq .SCONSIDERKLEFT \n\t" +BEQ(SCONSIDERKLEFT) " \n\t" " ldr q0, [x0] \n\t" " ldr q1, [x0, #16] \n\t" // Load a @@ -168,9 +169,9 @@ __asm__ volatile " add x1, x1, #48 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. -" beq .SLASTITER \n\t" // (as loop is do-while-like). +BEQ(SLASTITER) // (as loop is do-while-like). " \n\t" -" .SLOOPKITER: \n\t" // Body of the k_iter loop. +LABEL(SLOOPKITER) // Body of the k_iter loop. " \n\t" " ldr q5, [x0] \n\t" " fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate. @@ -316,9 +317,9 @@ __asm__ volatile " \n\t" //End It 4 " sub x5,x5,1 \n\t" // i-=1. " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. -" bne .SLOOPKITER \n\t" +BNE(SLOOPKITER) " \n\t" -" .SLASTITER: \n\t" // Last iteration of k_iter loop. +LABEL(SLASTITER) // Last iteration of k_iter loop. " \n\t" " \n\t" " ldr q5, [x0] \n\t" @@ -454,11 +455,11 @@ __asm__ volatile " add x0, x0, #96 \n\t" " \n\t" //End It 4 " \n\t" -" .SCONSIDERKLEFT: \n\t" +LABEL(SCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. 
-" beq .SPOSTACCUM \n\t" // else, we enter the k_left loop. +BEQ(SPOSTACCUM) // else, we enter the k_left loop. " \n\t" -" .SLOOPKLEFT: \n\t" // Body of the left iterations +LABEL(SLOOPKLEFT) // Body of the left iterations " \n\t" " ldr q0, [x0],#16 \n\t" " ldr q1, [x0],#16 \n\t" // Load a @@ -497,17 +498,17 @@ __asm__ volatile " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " \n\t" " cmp x6,0 \n\t" // Iterate again. -" bne .SLOOPKLEFT \n\t" // if i!=0. +BNE(SLOOPKLEFT) // if i!=0. " \n\t" -" .SPOSTACCUM: \n\t" +LABEL(SPOSTACCUM) " \n\t" " ld1r {v6.4s},[x7] \n\t" // Load alpha. " ld1r {v7.4s},[x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) -" bne .SGENSTORED \n\t" +BNE(SGENSTORED) " \n\t" -" .SCOLSTORED: \n\t" // C is column-major. +LABEL(SCOLSTORED) // C is column-major. " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" @@ -517,7 +518,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x2] \n\t" //Load column 0 of C " ldr q1, [x2, #16] \n\t" @@ -533,7 +534,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS1: \n\t" +LABEL(SBETAZEROCOLSTOREDS1) " \n\t" " fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha @@ -557,7 +558,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
" \n\t" " ldr q8, [x18] \n\t" //Load column 3 of C " ldr q9, [x18, #16] \n\t" @@ -573,7 +574,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS2: \n\t" +LABEL(SBETAZEROCOLSTOREDS2) " \n\t" " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha @@ -597,7 +598,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x21] \n\t" //Load column 6 of C " ldr q1, [x21, #16] \n\t" @@ -613,7 +614,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS3: \n\t" +LABEL(SBETAZEROCOLSTOREDS3) " \n\t" " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha @@ -637,7 +638,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. " \n\t" " ldr q8, [x24] \n\t" //Load column 9 of C " ldr q9, [x24, #16] \n\t" @@ -653,7 +654,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROCOLSTOREDS4: \n\t" +LABEL(SBETAZEROCOLSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -673,10 +674,10 @@ __asm__ volatile " str q13, [x26, #16] \n\t" " \n\t" " \n\t" -" b .SEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). +BRANCH(SEND) // Done. " \n\t" " \n\t" -" .SGENSTORED: \n\t" // C is general-stride stored. +LABEL(SGENSTORED) // C is general-stride stored. 
" \n\t" " \n\t" " dup v0.4s, wzr \n\t" @@ -687,7 +688,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. " \n\t" " mov x27, x2 \n\t" " \n\t" @@ -729,7 +730,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS1: \n\t" +LABEL(SBETAZEROGENSTOREDS1) " \n\t" " fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha @@ -779,7 +780,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. " \n\t" " mov x27, x18 \n\t" " \n\t" @@ -821,7 +822,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS2: \n\t" +LABEL(SBETAZEROGENSTOREDS2) " \n\t" " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha @@ -871,7 +872,7 @@ __asm__ volatile " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. " \n\t" " mov x27, x21 \n\t" " \n\t" @@ -913,7 +914,7 @@ __asm__ volatile " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS3: \n\t" +LABEL(SBETAZEROGENSTOREDS3) " \n\t" " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha @@ -963,7 +964,7 @@ __asm__ volatile " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" -" beq .SBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. 
+BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. " \n\t" " mov x27, x24 \n\t" " \n\t" @@ -1005,7 +1006,7 @@ __asm__ volatile " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" .SBETAZEROGENSTOREDS4: \n\t" +LABEL(SBETAZEROGENSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -1050,7 +1051,7 @@ __asm__ volatile " st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c. " st1 {v13.s}[3],[x27],x14 \n\t" // Store c147 into quad and increment by rs_c. " \n\t" -" .SEND: \n\t" // Done! +LABEL(SEND) // Done! " \n\t" :// output operands (none) :// input operands @@ -1203,7 +1204,7 @@ __asm__ volatile " \n\t" " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. -" beq .DCONSIDERKLEFT \n\t" +BEQ(DCONSIDERKLEFT) " \n\t" " ldr q0, [x0] \n\t" // Load a " ldr q1, [x0, #16] \n\t" @@ -1218,9 +1219,9 @@ __asm__ volatile " add x1, x1, #64 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. -" beq .DLASTITER \n\t" // (as loop is do-while-like). +BEQ(DLASTITER) // (as loop is do-while-like). " \n\t" -" DLOOP: \n\t" // Body +LABEL(DLOOP) // Body " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448 @@ -1394,9 +1395,9 @@ __asm__ volatile " \n\t" " sub x5,x5,1 \n\t" // i-=1 " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. -" bne DLOOP \n\t" +BNE(DLOOP) " \n\t" -".DLASTITER: \n\t" +LABEL(DLASTITER) " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate @@ -1554,11 +1555,11 @@ __asm__ volatile " \n\t" //End it 4 " add x0, x0, #144 \n\t" " \n\t" -" .DCONSIDERKLEFT: \n\t" +LABEL(DCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. -" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. +BEQ(DPOSTACCUM) // else, we enter the k_left loop. 
" \n\t" -".DLOOPKLEFT: \n\t" +LABEL(DLOOPKLEFT) " \n\t" " ldr q0, [x0],#16 \n\t" " ldr q1, [x0],#16 \n\t" // Load a @@ -1605,17 +1606,17 @@ __asm__ volatile " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate " \n\t" " cmp x6,0 \n\t" // Iterate again. -" bne .DLOOPKLEFT \n\t" // if i!=0. +BNE(DLOOPKLEFT) // if i!=0. " \n\t" -" .DPOSTACCUM: \n\t" +LABEL(DPOSTACCUM) " \n\t" " ld1r {v6.2d},[x7] \n\t" // Load alpha. " ld1r {v7.2d},[x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) -" bne .DGENSTORED \n\t" +BNE(DGENSTORED) " \n\t" -" .DCOLSTORED: \n\t" // C is column-major. +LABEL(DCOLSTORED) // C is column-major. " \n\t" " dup v0.2d, xzr \n\t" " dup v1.2d, xzr \n\t" @@ -1625,7 +1626,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x2] \n\t" //Load column 0 of C " ldr q1, [x2, #16] \n\t" @@ -1642,7 +1643,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS1: \n\t" +LABEL(DBETAZEROCOLSTOREDS1) " \n\t" " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha @@ -1667,7 +1668,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
" \n\t" " ldr q8, [x21] \n\t" //Load column 2 of C " ldr q9, [x21, #16] \n\t" @@ -1684,7 +1685,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS2: \n\t" +LABEL(DBETAZEROCOLSTOREDS2) " \n\t" " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha @@ -1709,7 +1710,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x23] \n\t" //Load column 4 of C " ldr q1, [x23, #16] \n\t" @@ -1726,7 +1727,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS3: \n\t" +LABEL(DBETAZEROCOLSTOREDS3) " \n\t" " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha @@ -1751,7 +1752,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. " \n\t" " ldr q8, [x25] \n\t" //Load column 6 of C " ldr q9, [x25, #16] \n\t" @@ -1768,7 +1769,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROCOLSTOREDS4: \n\t" +LABEL(DBETAZEROCOLSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -1788,9 +1789,9 @@ __asm__ volatile " str q12, [x26, #16] \n\t" " str q13, [x26, #32] \n\t" " \n\t" -" b .DEND \n\t" +BRANCH(DEND) " \n\t" -" .DGENSTORED: \n\t" // C is general-stride stored. +LABEL(DGENSTORED) // C is general-stride stored. 
" \n\t" " dup v0.2d, xzr \n\t" " dup v1.2d, xzr \n\t" @@ -1800,7 +1801,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. " \n\t" " mov x27, x2 \n\t" " \n\t" // Load address of C. @@ -1827,7 +1828,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS1: \n\t" +LABEL(DBETAZEROGENSTOREDS1) " \n\t" " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha @@ -1862,7 +1863,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. " \n\t" " mov x27, x21 \n\t" // Load address of C. " \n\t" @@ -1889,7 +1890,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS2: \n\t" +LABEL(DBETAZEROGENSTOREDS2) " \n\t" " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha @@ -1924,7 +1925,7 @@ __asm__ volatile " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. " \n\t" " mov x27, x23 \n\t" // Load address of C. 
" \n\t" @@ -1951,7 +1952,7 @@ __asm__ volatile " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS3: \n\t" +LABEL(DBETAZEROGENSTOREDS3) " \n\t" " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha @@ -1986,7 +1987,7 @@ __asm__ volatile " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" -" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. " \n\t" " mov x27, x25 \n\t" " \n\t" @@ -2013,7 +2014,7 @@ __asm__ volatile " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" .DBETAZEROGENSTOREDS4: \n\t" +LABEL(DBETAZEROGENSTOREDS4) " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" @@ -2043,7 +2044,7 @@ __asm__ volatile " st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. " st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. " \n\t" -" .DEND: \n\t" // Done! +LABEL(DEND) // Done! " \n\t" :// output operands (none) :// input operands