mirror of
https://github.com/amd/blis.git
synced 2026-05-12 18:15:37 +00:00
Asm Flag Mingling for Darwin_Aarch64
Apple+Arm64 requires additional "tagging" of local symbols.
This commit is contained in:
49
kernels/armv8a/3/armv8a_asm_utils.h
Normal file
49
kernels/armv8a/3/armv8a_asm_utils.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2021, The University of Tokyo
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
// Label and branch helpers for inline-assembly templates.
//
// Each macro expands to a string literal, so it can be placed directly
// between the other string fragments of an __asm__ template, e.g.
//
//     BEQ(SEND)
//     ...
//     LABEL(SEND)
//
// The argument is a bare label name. It is turned into text with the '#'
// (stringize) operator, so it is not macro-expanded first. The label name
// is prefixed with:
//   - "L" when __APPLE__ is defined (Apple's local label requirements),
//   - "." otherwise.
//
// LABEL(str)  defines the label.
// BEQ(str)    emits "b.eq" to the label.
// BNE(str)    emits "b.ne" to the label.
// BRANCH(str) emits an unconditional "b" to the label.
//
// Every expansion ends in " \n\t", matching the hand-written template lines.
#if defined(__APPLE__)

#define LABEL(str)  " L" #str ": \n\t"
#define BEQ(str)    "b.eq L" #str " \n\t"
#define BNE(str)    "b.ne L" #str " \n\t"
#define BRANCH(str) "b L" #str " \n\t"

#else

#define LABEL(str)  " ." #str ": \n\t"
#define BEQ(str)    "b.eq ." #str " \n\t"
#define BNE(str)    "b.ne ." #str " \n\t"
#define BRANCH(str) "b ." #str " \n\t"

#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "armv8a_asm_utils.h"
|
||||
|
||||
/*
|
||||
o 4x4 Single precision micro-kernel fully functional.
|
||||
@@ -155,7 +156,7 @@ __asm__ volatile
|
||||
" dup v31.4s, wzr \n\t" // Vector for accummulating column 11
|
||||
" \n\t"
|
||||
" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left.
|
||||
" beq .SCONSIDERKLEFT \n\t"
|
||||
BEQ(SCONSIDERKLEFT)
|
||||
" \n\t"
|
||||
" ldr q0, [x0] \n\t"
|
||||
" ldr q1, [x0, #16] \n\t" // Load a
|
||||
@@ -168,9 +169,9 @@ __asm__ volatile
|
||||
" add x1, x1, #48 \n\t" //update address of B
|
||||
" \n\t"
|
||||
" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one.
|
||||
" beq .SLASTITER \n\t" // (as loop is do-while-like).
|
||||
BEQ(SLASTITER) // (as loop is do-while-like).
|
||||
" \n\t"
|
||||
" .SLOOPKITER: \n\t" // Body of the k_iter loop.
|
||||
LABEL(SLOOPKITER) // Body of the k_iter loop.
|
||||
" \n\t"
|
||||
" ldr q5, [x0] \n\t"
|
||||
" fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate.
|
||||
@@ -316,9 +317,9 @@ __asm__ volatile
|
||||
" \n\t" //End It 4
|
||||
" sub x5,x5,1 \n\t" // i-=1.
|
||||
" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1.
|
||||
" bne .SLOOPKITER \n\t"
|
||||
BNE(SLOOPKITER)
|
||||
" \n\t"
|
||||
" .SLASTITER: \n\t" // Last iteration of k_iter loop.
|
||||
LABEL(SLASTITER) // Last iteration of k_iter loop.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" ldr q5, [x0] \n\t"
|
||||
@@ -454,11 +455,11 @@ __asm__ volatile
|
||||
" add x0, x0, #96 \n\t"
|
||||
" \n\t" //End It 4
|
||||
" \n\t"
|
||||
" .SCONSIDERKLEFT: \n\t"
|
||||
LABEL(SCONSIDERKLEFT)
|
||||
" cmp x6,0 \n\t" // If k_left == 0, we are done.
|
||||
" beq .SPOSTACCUM \n\t" // else, we enter the k_left loop.
|
||||
BEQ(SPOSTACCUM) // else, we enter the k_left loop.
|
||||
" \n\t"
|
||||
" .SLOOPKLEFT: \n\t" // Body of the left iterations
|
||||
LABEL(SLOOPKLEFT) // Body of the left iterations
|
||||
" \n\t"
|
||||
" ldr q0, [x0],#16 \n\t"
|
||||
" ldr q1, [x0],#16 \n\t" // Load a
|
||||
@@ -497,17 +498,17 @@ __asm__ volatile
|
||||
" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate.
|
||||
" \n\t"
|
||||
" cmp x6,0 \n\t" // Iterate again.
|
||||
" bne .SLOOPKLEFT \n\t" // if i!=0.
|
||||
BNE(SLOOPKLEFT) // if i!=0.
|
||||
" \n\t"
|
||||
" .SPOSTACCUM: \n\t"
|
||||
LABEL(SPOSTACCUM)
|
||||
" \n\t"
|
||||
" ld1r {v6.4s},[x7] \n\t" // Load alpha.
|
||||
" ld1r {v7.4s},[x8] \n\t" // Load beta
|
||||
" \n\t"
|
||||
" cmp x13,#1 \n\t" // If rs_c != 1 (column-major)
|
||||
" bne .SGENSTORED \n\t"
|
||||
BNE(SGENSTORED)
|
||||
" \n\t"
|
||||
" .SCOLSTORED: \n\t" // C is column-major.
|
||||
LABEL(SCOLSTORED) // C is column-major.
|
||||
" \n\t"
|
||||
" dup v0.4s, wzr \n\t"
|
||||
" dup v1.4s, wzr \n\t"
|
||||
@@ -517,7 +518,7 @@ __asm__ volatile
|
||||
" dup v5.4s, wzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp s7,#0.0 \n\t"
|
||||
" beq .SBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" ldr q0, [x2] \n\t" //Load column 0 of C
|
||||
" ldr q1, [x2, #16] \n\t"
|
||||
@@ -533,7 +534,7 @@ __asm__ volatile
|
||||
" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .SBETAZEROCOLSTOREDS1: \n\t"
|
||||
LABEL(SBETAZEROCOLSTOREDS1)
|
||||
" \n\t"
|
||||
" fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
" fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
@@ -557,7 +558,7 @@ __asm__ volatile
|
||||
" dup v13.4s, wzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp s7,#0.0 \n\t"
|
||||
" beq .SBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" ldr q8, [x18] \n\t" //Load column 3 of C
|
||||
" ldr q9, [x18, #16] \n\t"
|
||||
@@ -573,7 +574,7 @@ __asm__ volatile
|
||||
" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .SBETAZEROCOLSTOREDS2: \n\t"
|
||||
LABEL(SBETAZEROCOLSTOREDS2)
|
||||
" \n\t"
|
||||
" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
@@ -597,7 +598,7 @@ __asm__ volatile
|
||||
" dup v5.4s, wzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp s7,#0.0 \n\t"
|
||||
" beq .SBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" ldr q0, [x21] \n\t" //Load column 6 of C
|
||||
" ldr q1, [x21, #16] \n\t"
|
||||
@@ -613,7 +614,7 @@ __asm__ volatile
|
||||
" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .SBETAZEROCOLSTOREDS3: \n\t"
|
||||
LABEL(SBETAZEROCOLSTOREDS3)
|
||||
" \n\t"
|
||||
" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
@@ -637,7 +638,7 @@ __asm__ volatile
|
||||
" dup v13.4s, wzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp s7,#0.0 \n\t"
|
||||
" beq .SBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" ldr q8, [x24] \n\t" //Load column 9 of C
|
||||
" ldr q9, [x24, #16] \n\t"
|
||||
@@ -653,7 +654,7 @@ __asm__ volatile
|
||||
" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .SBETAZEROCOLSTOREDS4: \n\t"
|
||||
LABEL(SBETAZEROCOLSTOREDS4)
|
||||
" \n\t"
|
||||
" prfm pldl2keep,[x3] \n\t"
|
||||
" prfm pldl2keep,[x4] \n\t"
|
||||
@@ -673,10 +674,10 @@ __asm__ volatile
|
||||
" str q13, [x26, #16] \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" b .SEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump).
|
||||
BRANCH(SEND) // Done.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" .SGENSTORED: \n\t" // C is general-stride stored.
|
||||
LABEL(SGENSTORED) // C is general-stride stored.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" dup v0.4s, wzr \n\t"
|
||||
@@ -687,7 +688,7 @@ __asm__ volatile
|
||||
" dup v5.4s, wzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp s7,#0.0 \n\t"
|
||||
" beq .SBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" mov x27, x2 \n\t"
|
||||
" \n\t"
|
||||
@@ -729,7 +730,7 @@ __asm__ volatile
|
||||
" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .SBETAZEROGENSTOREDS1: \n\t"
|
||||
LABEL(SBETAZEROGENSTOREDS1)
|
||||
" \n\t"
|
||||
" fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
" fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
@@ -779,7 +780,7 @@ __asm__ volatile
|
||||
" dup v13.4s, wzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp s7,#0.0 \n\t"
|
||||
" beq .SBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" mov x27, x18 \n\t"
|
||||
" \n\t"
|
||||
@@ -821,7 +822,7 @@ __asm__ volatile
|
||||
" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .SBETAZEROGENSTOREDS2: \n\t"
|
||||
LABEL(SBETAZEROGENSTOREDS2)
|
||||
" \n\t"
|
||||
" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
@@ -871,7 +872,7 @@ __asm__ volatile
|
||||
" dup v5.4s, wzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp s7,#0.0 \n\t"
|
||||
" beq .SBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" mov x27, x21 \n\t"
|
||||
" \n\t"
|
||||
@@ -913,7 +914,7 @@ __asm__ volatile
|
||||
" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .SBETAZEROGENSTOREDS3: \n\t"
|
||||
LABEL(SBETAZEROGENSTOREDS3)
|
||||
" \n\t"
|
||||
" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha
|
||||
@@ -963,7 +964,7 @@ __asm__ volatile
|
||||
" dup v13.4s, wzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp s7,#0.0 \n\t"
|
||||
" beq .SBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" mov x27, x24 \n\t"
|
||||
" \n\t"
|
||||
@@ -1005,7 +1006,7 @@ __asm__ volatile
|
||||
" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .SBETAZEROGENSTOREDS4: \n\t"
|
||||
LABEL(SBETAZEROGENSTOREDS4)
|
||||
" \n\t"
|
||||
" prfm pldl2keep,[x3] \n\t"
|
||||
" prfm pldl2keep,[x4] \n\t"
|
||||
@@ -1050,7 +1051,7 @@ __asm__ volatile
|
||||
" st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c.
|
||||
" st1 {v13.s}[3],[x27],x14 \n\t" // Store c147 into quad and increment by rs_c.
|
||||
" \n\t"
|
||||
" .SEND: \n\t" // Done!
|
||||
LABEL(SEND) // Done!
|
||||
" \n\t"
|
||||
:// output operands (none)
|
||||
:// input operands
|
||||
@@ -1203,7 +1204,7 @@ __asm__ volatile
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left.
|
||||
" beq .DCONSIDERKLEFT \n\t"
|
||||
BEQ(DCONSIDERKLEFT)
|
||||
" \n\t"
|
||||
" ldr q0, [x0] \n\t" // Load a
|
||||
" ldr q1, [x0, #16] \n\t"
|
||||
@@ -1218,9 +1219,9 @@ __asm__ volatile
|
||||
" add x1, x1, #64 \n\t" //update address of B
|
||||
" \n\t"
|
||||
" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one.
|
||||
" beq .DLASTITER \n\t" // (as loop is do-while-like).
|
||||
BEQ(DLASTITER) // (as loop is do-while-like).
|
||||
" \n\t"
|
||||
" DLOOP: \n\t" // Body
|
||||
LABEL(DLOOP) // Body
|
||||
" \n\t"
|
||||
" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate
|
||||
" prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448
|
||||
@@ -1394,9 +1395,9 @@ __asm__ volatile
|
||||
" \n\t"
|
||||
" sub x5,x5,1 \n\t" // i-=1
|
||||
" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1.
|
||||
" bne DLOOP \n\t"
|
||||
BNE(DLOOP)
|
||||
" \n\t"
|
||||
".DLASTITER: \n\t"
|
||||
LABEL(DLASTITER)
|
||||
" \n\t"
|
||||
" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate
|
||||
" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate
|
||||
@@ -1554,11 +1555,11 @@ __asm__ volatile
|
||||
" \n\t" //End it 4
|
||||
" add x0, x0, #144 \n\t"
|
||||
" \n\t"
|
||||
" .DCONSIDERKLEFT: \n\t"
|
||||
LABEL(DCONSIDERKLEFT)
|
||||
" cmp x6,0 \n\t" // If k_left == 0, we are done.
|
||||
" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop.
|
||||
BEQ(DPOSTACCUM) // else, we enter the k_left loop.
|
||||
" \n\t"
|
||||
".DLOOPKLEFT: \n\t"
|
||||
LABEL(DLOOPKLEFT)
|
||||
" \n\t"
|
||||
" ldr q0, [x0],#16 \n\t"
|
||||
" ldr q1, [x0],#16 \n\t" // Load a
|
||||
@@ -1605,17 +1606,17 @@ __asm__ volatile
|
||||
" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate
|
||||
" \n\t"
|
||||
" cmp x6,0 \n\t" // Iterate again.
|
||||
" bne .DLOOPKLEFT \n\t" // if i!=0.
|
||||
BNE(DLOOPKLEFT) // if i!=0.
|
||||
" \n\t"
|
||||
" .DPOSTACCUM: \n\t"
|
||||
LABEL(DPOSTACCUM)
|
||||
" \n\t"
|
||||
" ld1r {v6.2d},[x7] \n\t" // Load alpha.
|
||||
" ld1r {v7.2d},[x8] \n\t" // Load beta
|
||||
" \n\t"
|
||||
" cmp x13,#1 \n\t" // If rs_c != 1 (column-major)
|
||||
" bne .DGENSTORED \n\t"
|
||||
BNE(DGENSTORED)
|
||||
" \n\t"
|
||||
" .DCOLSTORED: \n\t" // C is column-major.
|
||||
LABEL(DCOLSTORED) // C is column-major.
|
||||
" \n\t"
|
||||
" dup v0.2d, xzr \n\t"
|
||||
" dup v1.2d, xzr \n\t"
|
||||
@@ -1625,7 +1626,7 @@ __asm__ volatile
|
||||
" dup v5.2d, xzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp d7,#0.0 \n\t"
|
||||
" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" ldr q0, [x2] \n\t" //Load column 0 of C
|
||||
" ldr q1, [x2, #16] \n\t"
|
||||
@@ -1642,7 +1643,7 @@ __asm__ volatile
|
||||
" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .DBETAZEROCOLSTOREDS1: \n\t"
|
||||
LABEL(DBETAZEROCOLSTOREDS1)
|
||||
" \n\t"
|
||||
" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
@@ -1667,7 +1668,7 @@ __asm__ volatile
|
||||
" dup v13.2d, xzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp d7,#0.0 \n\t"
|
||||
" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(DBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" ldr q8, [x21] \n\t" //Load column 2 of C
|
||||
" ldr q9, [x21, #16] \n\t"
|
||||
@@ -1684,7 +1685,7 @@ __asm__ volatile
|
||||
" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .DBETAZEROCOLSTOREDS2: \n\t"
|
||||
LABEL(DBETAZEROCOLSTOREDS2)
|
||||
" \n\t"
|
||||
" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
@@ -1709,7 +1710,7 @@ __asm__ volatile
|
||||
" dup v5.2d, xzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp d7,#0.0 \n\t"
|
||||
" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" ldr q0, [x23] \n\t" //Load column 4 of C
|
||||
" ldr q1, [x23, #16] \n\t"
|
||||
@@ -1726,7 +1727,7 @@ __asm__ volatile
|
||||
" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .DBETAZEROCOLSTOREDS3: \n\t"
|
||||
LABEL(DBETAZEROCOLSTOREDS3)
|
||||
" \n\t"
|
||||
" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
@@ -1751,7 +1752,7 @@ __asm__ volatile
|
||||
" dup v13.2d, xzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp d7,#0.0 \n\t"
|
||||
" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" ldr q8, [x25] \n\t" //Load column 6 of C
|
||||
" ldr q9, [x25, #16] \n\t"
|
||||
@@ -1768,7 +1769,7 @@ __asm__ volatile
|
||||
" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .DBETAZEROCOLSTOREDS4: \n\t"
|
||||
LABEL(DBETAZEROCOLSTOREDS4)
|
||||
" \n\t"
|
||||
" prfm pldl2keep,[x3] \n\t"
|
||||
" prfm pldl2keep,[x4] \n\t"
|
||||
@@ -1788,9 +1789,9 @@ __asm__ volatile
|
||||
" str q12, [x26, #16] \n\t"
|
||||
" str q13, [x26, #32] \n\t"
|
||||
" \n\t"
|
||||
" b .DEND \n\t"
|
||||
BRANCH(DEND)
|
||||
" \n\t"
|
||||
" .DGENSTORED: \n\t" // C is general-stride stored.
|
||||
LABEL(DGENSTORED) // C is general-stride stored.
|
||||
" \n\t"
|
||||
" dup v0.2d, xzr \n\t"
|
||||
" dup v1.2d, xzr \n\t"
|
||||
@@ -1800,7 +1801,7 @@ __asm__ volatile
|
||||
" dup v5.2d, xzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp d7,#0.0 \n\t"
|
||||
" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" mov x27, x2 \n\t"
|
||||
" \n\t" // Load address of C.
|
||||
@@ -1827,7 +1828,7 @@ __asm__ volatile
|
||||
" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .DBETAZEROGENSTOREDS1: \n\t"
|
||||
LABEL(DBETAZEROGENSTOREDS1)
|
||||
" \n\t"
|
||||
" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
@@ -1862,7 +1863,7 @@ __asm__ volatile
|
||||
" dup v13.2d, xzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp d7,#0.0 \n\t"
|
||||
" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" mov x27, x21 \n\t" // Load address of C.
|
||||
" \n\t"
|
||||
@@ -1889,7 +1890,7 @@ __asm__ volatile
|
||||
" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .DBETAZEROGENSTOREDS2: \n\t"
|
||||
LABEL(DBETAZEROGENSTOREDS2)
|
||||
" \n\t"
|
||||
" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
@@ -1924,7 +1925,7 @@ __asm__ volatile
|
||||
" dup v5.2d, xzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp d7,#0.0 \n\t"
|
||||
" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" mov x27, x23 \n\t" // Load address of C.
|
||||
" \n\t"
|
||||
@@ -1951,7 +1952,7 @@ __asm__ volatile
|
||||
" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .DBETAZEROGENSTOREDS3: \n\t"
|
||||
LABEL(DBETAZEROGENSTOREDS3)
|
||||
" \n\t"
|
||||
" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha
|
||||
@@ -1986,7 +1987,7 @@ __asm__ volatile
|
||||
" dup v13.2d, xzr \n\t"
|
||||
" \n\t"
|
||||
" fcmp d7,#0.0 \n\t"
|
||||
" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case.
|
||||
BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case.
|
||||
" \n\t"
|
||||
" mov x27, x25 \n\t"
|
||||
" \n\t"
|
||||
@@ -2013,7 +2014,7 @@ __asm__ volatile
|
||||
" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta
|
||||
" \n\t"
|
||||
" .DBETAZEROGENSTOREDS4: \n\t"
|
||||
LABEL(DBETAZEROGENSTOREDS4)
|
||||
" \n\t"
|
||||
" prfm pldl2keep,[x3] \n\t"
|
||||
" prfm pldl2keep,[x4] \n\t"
|
||||
@@ -2043,7 +2044,7 @@ __asm__ volatile
|
||||
" st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c.
|
||||
" st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c.
|
||||
" \n\t"
|
||||
" .DEND: \n\t" // Done!
|
||||
LABEL(DEND) // Done!
|
||||
" \n\t"
|
||||
:// output operands (none)
|
||||
:// input operands
|
||||
|
||||
Reference in New Issue
Block a user