Asm Flag Mangling for Darwin_Aarch64

Apple+Arm64 requires additional "tagging" of local symbols.
This commit is contained in:
RuQing Xu
2021-05-29 16:28:03 +09:00
parent 82af05f54c
commit 7fabd896af
2 changed files with 112 additions and 62 deletions

View File

@@ -0,0 +1,49 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Tokyo
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Local-label helpers for inline-assembly templates.
//
// Every macro below expands to a single string literal holding one assembler
// line terminated by "\n\t", so it can be placed directly between the other
// string fragments of an asm template. The argument is turned into text with
// the preprocessor's `#` operator, so pass the bare label name (no quotes, no
// prefix), e.g. LABEL(SEND) / BEQ(SEND).
//
// Apple's local label requirements: label names get an "L" prefix when
// __APPLE__ is defined, and a "." prefix on every other target.
#if defined(__APPLE__)
#define ARMV8A_ASM_LABEL_PREFIX "L"
#else
#define ARMV8A_ASM_LABEL_PREFIX "."
#endif

// Define the label `str` at this point of the template.
#define LABEL(str) " " ARMV8A_ASM_LABEL_PREFIX #str": \n\t"
// Branch to label `str` when the condition flags say "equal".
#define BEQ(str) "b.eq " ARMV8A_ASM_LABEL_PREFIX #str" \n\t"
// Branch to label `str` when the condition flags say "not equal".
#define BNE(str) "b.ne " ARMV8A_ASM_LABEL_PREFIX #str" \n\t"
// Unconditional branch to label `str`.
#define BRANCH(str) "b " ARMV8A_ASM_LABEL_PREFIX #str" \n\t"

View File

@@ -34,6 +34,7 @@
*/
#include "blis.h"
#include "armv8a_asm_utils.h"
/*
o 4x4 Single precision micro-kernel fully functional.
@@ -155,7 +156,7 @@ __asm__ volatile
" dup v31.4s, wzr \n\t" // Vector for accummulating column 11
" \n\t"
" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left.
" beq .SCONSIDERKLEFT \n\t"
BEQ(SCONSIDERKLEFT)
" \n\t"
" ldr q0, [x0] \n\t"
" ldr q1, [x0, #16] \n\t" // Load a
@@ -168,9 +169,9 @@ __asm__ volatile
" add x1, x1, #48 \n\t" //update address of B
" \n\t"
" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one.
" beq .SLASTITER \n\t" // (as loop is do-while-like).
BEQ(SLASTITER) // (as loop is do-while-like).
" \n\t"
" .SLOOPKITER: \n\t" // Body of the k_iter loop.
LABEL(SLOOPKITER) // Body of the k_iter loop.
" \n\t"
" ldr q5, [x0] \n\t"
" fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate.
@@ -316,9 +317,9 @@ __asm__ volatile
" \n\t" //End It 4
" sub x5,x5,1 \n\t" // i-=1.
" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1.
" bne .SLOOPKITER \n\t"
BNE(SLOOPKITER)
" \n\t"
" .SLASTITER: \n\t" // Last iteration of k_iter loop.
LABEL(SLASTITER) // Last iteration of k_iter loop.
" \n\t"
" \n\t"
" ldr q5, [x0] \n\t"
@@ -454,11 +455,11 @@ __asm__ volatile
" add x0, x0, #96 \n\t"
" \n\t" //End It 4
" \n\t"
" .SCONSIDERKLEFT: \n\t"
LABEL(SCONSIDERKLEFT)
" cmp x6,0 \n\t" // If k_left == 0, we are done.
" beq .SPOSTACCUM \n\t" // else, we enter the k_left loop.
BEQ(SPOSTACCUM) // else, we enter the k_left loop.
" \n\t"
" .SLOOPKLEFT: \n\t" // Body of the left iterations
LABEL(SLOOPKLEFT) // Body of the left iterations
" \n\t"
" ldr q0, [x0],#16 \n\t"
" ldr q1, [x0],#16 \n\t" // Load a
@@ -497,17 +498,17 @@ __asm__ volatile
" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate.
" \n\t"
" cmp x6,0 \n\t" // Iterate again.
" bne .SLOOPKLEFT \n\t" // if i!=0.
BNE(SLOOPKLEFT) // if i!=0.
" \n\t"
" .SPOSTACCUM: \n\t"
LABEL(SPOSTACCUM)
" \n\t"
" ld1r {v6.4s},[x7] \n\t" // Load alpha.
" ld1r {v7.4s},[x8] \n\t" // Load beta
" \n\t"
" cmp x13,#1 \n\t" // If rs_c != 1 (column-major)
" bne .SGENSTORED \n\t"
BNE(SGENSTORED)
" \n\t"
" .SCOLSTORED: \n\t" // C is column-major.
LABEL(SCOLSTORED) // C is column-major.
" \n\t"
" dup v0.4s, wzr \n\t"
" dup v1.4s, wzr \n\t"
@@ -517,7 +518,7 @@ __asm__ volatile
" dup v5.4s, wzr \n\t"
" \n\t"
" fcmp s7,#0.0 \n\t"
" beq .SBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case.
BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case.
" \n\t"
" ldr q0, [x2] \n\t" //Load column 0 of C
" ldr q1, [x2, #16] \n\t"
@@ -533,7 +534,7 @@ __asm__ volatile
" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta
" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta
" \n\t"
" .SBETAZEROCOLSTOREDS1: \n\t"
LABEL(SBETAZEROCOLSTOREDS1)
" \n\t"
" fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha
" fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha
@@ -557,7 +558,7 @@ __asm__ volatile
" dup v13.4s, wzr \n\t"
" \n\t"
" fcmp s7,#0.0 \n\t"
" beq .SBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case.
BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case.
" \n\t"
" ldr q8, [x18] \n\t" //Load column 3 of C
" ldr q9, [x18, #16] \n\t"
@@ -573,7 +574,7 @@ __asm__ volatile
" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta
" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta
" \n\t"
" .SBETAZEROCOLSTOREDS2: \n\t"
LABEL(SBETAZEROCOLSTOREDS2)
" \n\t"
" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha
" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha
@@ -597,7 +598,7 @@ __asm__ volatile
" dup v5.4s, wzr \n\t"
" \n\t"
" fcmp s7,#0.0 \n\t"
" beq .SBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case.
BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case.
" \n\t"
" ldr q0, [x21] \n\t" //Load column 6 of C
" ldr q1, [x21, #16] \n\t"
@@ -613,7 +614,7 @@ __asm__ volatile
" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta
" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta
" \n\t"
" .SBETAZEROCOLSTOREDS3: \n\t"
LABEL(SBETAZEROCOLSTOREDS3)
" \n\t"
" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha
" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha
@@ -637,7 +638,7 @@ __asm__ volatile
" dup v13.4s, wzr \n\t"
" \n\t"
" fcmp s7,#0.0 \n\t"
" beq .SBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case.
BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case.
" \n\t"
" ldr q8, [x24] \n\t" //Load column 9 of C
" ldr q9, [x24, #16] \n\t"
@@ -653,7 +654,7 @@ __asm__ volatile
" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta
" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta
" \n\t"
" .SBETAZEROCOLSTOREDS4: \n\t"
LABEL(SBETAZEROCOLSTOREDS4)
" \n\t"
" prfm pldl2keep,[x3] \n\t"
" prfm pldl2keep,[x4] \n\t"
@@ -673,10 +674,10 @@ __asm__ volatile
" str q13, [x26, #16] \n\t"
" \n\t"
" \n\t"
" b .SEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump).
BRANCH(SEND) // Done.
" \n\t"
" \n\t"
" .SGENSTORED: \n\t" // C is general-stride stored.
LABEL(SGENSTORED) // C is general-stride stored.
" \n\t"
" \n\t"
" dup v0.4s, wzr \n\t"
@@ -687,7 +688,7 @@ __asm__ volatile
" dup v5.4s, wzr \n\t"
" \n\t"
" fcmp s7,#0.0 \n\t"
" beq .SBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case.
BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case.
" \n\t"
" mov x27, x2 \n\t"
" \n\t"
@@ -729,7 +730,7 @@ __asm__ volatile
" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta
" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta
" \n\t"
" .SBETAZEROGENSTOREDS1: \n\t"
LABEL(SBETAZEROGENSTOREDS1)
" \n\t"
" fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha
" fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha
@@ -779,7 +780,7 @@ __asm__ volatile
" dup v13.4s, wzr \n\t"
" \n\t"
" fcmp s7,#0.0 \n\t"
" beq .SBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case.
BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case.
" \n\t"
" mov x27, x18 \n\t"
" \n\t"
@@ -821,7 +822,7 @@ __asm__ volatile
" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta
" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta
" \n\t"
" .SBETAZEROGENSTOREDS2: \n\t"
LABEL(SBETAZEROGENSTOREDS2)
" \n\t"
" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha
" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha
@@ -871,7 +872,7 @@ __asm__ volatile
" dup v5.4s, wzr \n\t"
" \n\t"
" fcmp s7,#0.0 \n\t"
" beq .SBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case.
BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case.
" \n\t"
" mov x27, x21 \n\t"
" \n\t"
@@ -913,7 +914,7 @@ __asm__ volatile
" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta
" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta
" \n\t"
" .SBETAZEROGENSTOREDS3: \n\t"
LABEL(SBETAZEROGENSTOREDS3)
" \n\t"
" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha
" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha
@@ -963,7 +964,7 @@ __asm__ volatile
" dup v13.4s, wzr \n\t"
" \n\t"
" fcmp s7,#0.0 \n\t"
" beq .SBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case.
BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case.
" \n\t"
" mov x27, x24 \n\t"
" \n\t"
@@ -1005,7 +1006,7 @@ __asm__ volatile
" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta
" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta
" \n\t"
" .SBETAZEROGENSTOREDS4: \n\t"
LABEL(SBETAZEROGENSTOREDS4)
" \n\t"
" prfm pldl2keep,[x3] \n\t"
" prfm pldl2keep,[x4] \n\t"
@@ -1050,7 +1051,7 @@ __asm__ volatile
" st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c.
" st1 {v13.s}[3],[x27],x14 \n\t" // Store c147 into quad and increment by rs_c.
" \n\t"
" .SEND: \n\t" // Done!
LABEL(SEND) // Done!
" \n\t"
:// output operands (none)
:// input operands
@@ -1203,7 +1204,7 @@ __asm__ volatile
" \n\t"
" \n\t"
" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left.
" beq .DCONSIDERKLEFT \n\t"
BEQ(DCONSIDERKLEFT)
" \n\t"
" ldr q0, [x0] \n\t" // Load a
" ldr q1, [x0, #16] \n\t"
@@ -1218,9 +1219,9 @@ __asm__ volatile
" add x1, x1, #64 \n\t" //update address of B
" \n\t"
" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one.
" beq .DLASTITER \n\t" // (as loop is do-while-like).
BEQ(DLASTITER) // (as loop is do-while-like).
" \n\t"
" DLOOP: \n\t" // Body
LABEL(DLOOP) // Body
" \n\t"
" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate
" prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448
@@ -1394,9 +1395,9 @@ __asm__ volatile
" \n\t"
" sub x5,x5,1 \n\t" // i-=1
" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1.
" bne DLOOP \n\t"
BNE(DLOOP)
" \n\t"
".DLASTITER: \n\t"
LABEL(DLASTITER)
" \n\t"
" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate
" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate
@@ -1554,11 +1555,11 @@ __asm__ volatile
" \n\t" //End it 4
" add x0, x0, #144 \n\t"
" \n\t"
" .DCONSIDERKLEFT: \n\t"
LABEL(DCONSIDERKLEFT)
" cmp x6,0 \n\t" // If k_left == 0, we are done.
" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop.
BEQ(DPOSTACCUM) // else, we enter the k_left loop.
" \n\t"
".DLOOPKLEFT: \n\t"
LABEL(DLOOPKLEFT)
" \n\t"
" ldr q0, [x0],#16 \n\t"
" ldr q1, [x0],#16 \n\t" // Load a
@@ -1605,17 +1606,17 @@ __asm__ volatile
" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate
" \n\t"
" cmp x6,0 \n\t" // Iterate again.
" bne .DLOOPKLEFT \n\t" // if i!=0.
BNE(DLOOPKLEFT) // if i!=0.
" \n\t"
" .DPOSTACCUM: \n\t"
LABEL(DPOSTACCUM)
" \n\t"
" ld1r {v6.2d},[x7] \n\t" // Load alpha.
" ld1r {v7.2d},[x8] \n\t" // Load beta
" \n\t"
" cmp x13,#1 \n\t" // If rs_c != 1 (column-major)
" bne .DGENSTORED \n\t"
BNE(DGENSTORED)
" \n\t"
" .DCOLSTORED: \n\t" // C is column-major.
LABEL(DCOLSTORED) // C is column-major.
" \n\t"
" dup v0.2d, xzr \n\t"
" dup v1.2d, xzr \n\t"
@@ -1625,7 +1626,7 @@ __asm__ volatile
" dup v5.2d, xzr \n\t"
" \n\t"
" fcmp d7,#0.0 \n\t"
" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case.
BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case.
" \n\t"
" ldr q0, [x2] \n\t" //Load column 0 of C
" ldr q1, [x2, #16] \n\t"
@@ -1642,7 +1643,7 @@ __asm__ volatile
" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta
" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROCOLSTOREDS1: \n\t"
LABEL(DBETAZEROCOLSTOREDS1)
" \n\t"
" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha
" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha
@@ -1667,7 +1668,7 @@ __asm__ volatile
" dup v13.2d, xzr \n\t"
" \n\t"
" fcmp d7,#0.0 \n\t"
" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case.
BEQ(DBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case.
" \n\t"
" ldr q8, [x21] \n\t" //Load column 2 of C
" ldr q9, [x21, #16] \n\t"
@@ -1684,7 +1685,7 @@ __asm__ volatile
" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta
" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROCOLSTOREDS2: \n\t"
LABEL(DBETAZEROCOLSTOREDS2)
" \n\t"
" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha
" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha
@@ -1709,7 +1710,7 @@ __asm__ volatile
" dup v5.2d, xzr \n\t"
" \n\t"
" fcmp d7,#0.0 \n\t"
" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case.
BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case.
" \n\t"
" ldr q0, [x23] \n\t" //Load column 4 of C
" ldr q1, [x23, #16] \n\t"
@@ -1726,7 +1727,7 @@ __asm__ volatile
" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta
" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROCOLSTOREDS3: \n\t"
LABEL(DBETAZEROCOLSTOREDS3)
" \n\t"
" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha
" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha
@@ -1751,7 +1752,7 @@ __asm__ volatile
" dup v13.2d, xzr \n\t"
" \n\t"
" fcmp d7,#0.0 \n\t"
" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case.
BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case.
" \n\t"
" ldr q8, [x25] \n\t" //Load column 6 of C
" ldr q9, [x25, #16] \n\t"
@@ -1768,7 +1769,7 @@ __asm__ volatile
" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta
" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROCOLSTOREDS4: \n\t"
LABEL(DBETAZEROCOLSTOREDS4)
" \n\t"
" prfm pldl2keep,[x3] \n\t"
" prfm pldl2keep,[x4] \n\t"
@@ -1788,9 +1789,9 @@ __asm__ volatile
" str q12, [x26, #16] \n\t"
" str q13, [x26, #32] \n\t"
" \n\t"
" b .DEND \n\t"
BRANCH(DEND)
" \n\t"
" .DGENSTORED: \n\t" // C is general-stride stored.
LABEL(DGENSTORED) // C is general-stride stored.
" \n\t"
" dup v0.2d, xzr \n\t"
" dup v1.2d, xzr \n\t"
@@ -1800,7 +1801,7 @@ __asm__ volatile
" dup v5.2d, xzr \n\t"
" \n\t"
" fcmp d7,#0.0 \n\t"
" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case.
BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case.
" \n\t"
" mov x27, x2 \n\t"
" \n\t" // Load address of C.
@@ -1827,7 +1828,7 @@ __asm__ volatile
" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta
" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROGENSTOREDS1: \n\t"
LABEL(DBETAZEROGENSTOREDS1)
" \n\t"
" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha
" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha
@@ -1862,7 +1863,7 @@ __asm__ volatile
" dup v13.2d, xzr \n\t"
" \n\t"
" fcmp d7,#0.0 \n\t"
" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case.
BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case.
" \n\t"
" mov x27, x21 \n\t" // Load address of C.
" \n\t"
@@ -1889,7 +1890,7 @@ __asm__ volatile
" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta
" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROGENSTOREDS2: \n\t"
LABEL(DBETAZEROGENSTOREDS2)
" \n\t"
" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha
" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha
@@ -1924,7 +1925,7 @@ __asm__ volatile
" dup v5.2d, xzr \n\t"
" \n\t"
" fcmp d7,#0.0 \n\t"
" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case.
BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case.
" \n\t"
" mov x27, x23 \n\t" // Load address of C.
" \n\t"
@@ -1951,7 +1952,7 @@ __asm__ volatile
" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta
" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROGENSTOREDS3: \n\t"
LABEL(DBETAZEROGENSTOREDS3)
" \n\t"
" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha
" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha
@@ -1986,7 +1987,7 @@ __asm__ volatile
" dup v13.2d, xzr \n\t"
" \n\t"
" fcmp d7,#0.0 \n\t"
" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case.
BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case.
" \n\t"
" mov x27, x25 \n\t"
" \n\t"
@@ -2013,7 +2014,7 @@ __asm__ volatile
" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta
" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROGENSTOREDS4: \n\t"
LABEL(DBETAZEROGENSTOREDS4)
" \n\t"
" prfm pldl2keep,[x3] \n\t"
" prfm pldl2keep,[x4] \n\t"
@@ -2043,7 +2044,7 @@ __asm__ volatile
" st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c.
" st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c.
" \n\t"
" .DEND: \n\t" // Done!
LABEL(DEND) // Done!
" \n\t"
:// output operands (none)
:// input operands