Merge pull request #35 from figual/master

Fixed incomplete code in the double precision ARMv8 microkernel.
This commit is contained in:
Field G. Van Zee
2015-10-27 12:41:23 -05:00
3 changed files with 260 additions and 62 deletions

View File

@@ -51,12 +51,12 @@
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
#define BLIS_DEFAULT_MC_S 416 // 1280 //160 // 160 // 160 //2048 //336
#define BLIS_DEFAULT_KC_S 704 //1280 //672 //528 // 856 //2048 //528
#define BLIS_DEFAULT_MC_S 336
#define BLIS_DEFAULT_KC_S 336
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_D 80 //176
#define BLIS_DEFAULT_KC_D 336 //368
#define BLIS_DEFAULT_MC_D 160
#define BLIS_DEFAULT_KC_D 304
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_C 64
@@ -132,8 +132,6 @@
//#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...)
//#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------

View File

@@ -76,20 +76,18 @@ GIT_LOG := $(GIT) log --decorate
#
# --- Determine the C compiler and related flags ---
##CC := gcc
CC := aarch64-linux-gnu-gcc
#CC := arm-linux-gnueabihf-gcc-4.9.2
CC := gcc
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
#CMISCFLAGS := -std=c99 -mtune=cortex-a57 -mfpu=neon-fp-armv8 -march=armv8-a #-mfloat-abi=hard -mfpu=neon
CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-mtune=cortex-a57 -march=armv8-a -mfloat-abi=hard -mfpu=neon
CPPROCFLAGS := -D_GNU_SOURCE
CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -fopenmp -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
CPICFLAGS := -fPIC
CDBGFLAGS := -g
CDBGFLAGS := -g #-g3 -gdwarf-2
CWARNFLAGS := -Wall
COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8 #-mfpu=neon -O2
COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8
CVECFLAGS := #-march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
@@ -105,7 +103,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm
LDFLAGS := -lm -fopenmp

View File

@@ -1,4 +1,4 @@
/*
/*
BLIS
An object-based framework for developing high-performance BLAS-like
@@ -36,12 +36,7 @@
#include "blis.h"
/*
o 4x4 Single precision micro-kernel fully functional.
o Runnable on ARMv8, compiled with aarch64 GCC.
o Use it together with the armv8 BLIS configuration.
o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz.
December 2014.
*/
void bli_sgemm_opt_4x4(
dim_t k,
@@ -55,15 +50,15 @@ void bli_sgemm_opt_4x4(
{
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
dim_t k_left = k % 4;
__asm__ volatile
__asm__ volatile
(
" \n\t"
" \n\t"
" ldr x0,%[aaddr] \n\t" // Load address of A.
" ldr x0,%[aaddr] \n\t" // Load address of A.
" ldr x1,%[baddr] \n\t" // Load address of B.
" ldr x2,%[caddr] \n\t" // Load address of C.
" \n\t"
@@ -433,12 +428,8 @@ __asm__ volatile
/*
o 4x4 Double precision micro-kernel NOT fully functional yet.
o Runnable on ARMv8, compiled with aarch64 GCC.
o Use it together with the armv8 BLIS configuration.
o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz.
December 2014.
* Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz.
* Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz.
*/
void bli_dgemm_opt_4x4(
dim_t k,
@@ -458,8 +449,8 @@ void bli_dgemm_opt_4x4(
__asm__ volatile
(
" \n\t"
" ldr x0,%[aaddr] \n\t" // Load address of A
" \n\t"
" ldr x0,%[aaddr] \n\t" // Load address of A
" ldr x1,%[baddr] \n\t" // Load address of B
" ldr x2,%[caddr] \n\t" // Load address of C
" \n\t"
@@ -476,9 +467,16 @@ __asm__ volatile
" \n\t"
" ldr x9,%[cs_c] \n\t" // Load cs_c
" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double)
" \n\t"
" ldp q0,q1,[x0],32 \n\t" // Load a into quad
" ldp q4,q5,[x1],32 \n\t" // Load b into quad
" lsl x11,x9,#4 \n\t" // 2 * cs_c * sizeof(double) -- AUX.
" lsl x12,x9,#5 \n\t" // 4 * cs_c * sizeof(double) (x9 << 5 = 32 * cs_c, not 3 * cs_c) -- AUX.
" \n\t"
" ldr x13,%[rs_c] \n\t" // Load rs_c.
" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double).
" \n\t"
" prfm pldl1keep,[x2,0] \n\t" // Prefetch c.
" prfm pldl1keep,[x2,x10] \n\t" // Prefetch c.
" prfm pldl1keep,[x2,x11] \n\t" // Prefetch c.
" prfm pldl1keep,[x2,x12] \n\t" // Prefetch c.
" \n\t"
" movi v12.2d,#0 \n\t" // Vector for accummulating column 0
" movi v13.2d,#0 \n\t" // Vector for accummulating column 0
@@ -489,10 +487,33 @@ __asm__ volatile
" movi v18.2d,#0 \n\t" // Vector for accummulating column 3
" movi v19.2d,#0 \n\t" // Vector for accummulating column 3
" \n\t"
" movi v20.2d,#0 \n\t" // Output vector (beta*C + alpha*AB) for column 0
" movi v21.2d,#0 \n\t" // Output vector (beta*C + alpha*AB) for column 0
" movi v22.2d,#0 \n\t" // Output vector (beta*C + alpha*AB) for column 1
" movi v23.2d,#0 \n\t" // Output vector (beta*C + alpha*AB) for column 1
" movi v24.2d,#0 \n\t" // Output vector (beta*C + alpha*AB) for column 2
" movi v25.2d,#0 \n\t" // Output vector (beta*C + alpha*AB) for column 2
" movi v26.2d,#0 \n\t" // Output vector (beta*C + alpha*AB) for column 3
" movi v27.2d,#0 \n\t" // Output vector (beta*C + alpha*AB) for column 3
" \n\t"
" ld1r {v31.2d},[x8] \n\t" // Load beta
" \n\t"
" \n\t"
" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left.
" beq .DCONSIDERKLEFT \n\t"
" \n\t"
" ldp q0,q1,[x0],32 \n\t" // Load a
" ldp q4,q5,[x1],32 \n\t" // Load b
" \n\t"
" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one.
" beq .DLASTITER \n\t" // (as loop is do-while-like).
" \n\t"
" \n\t"
" DLOOP: \n\t" // Body
" \n\t"
" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch.
" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch.
" \n\t"
" ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad
" \n\t"
" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate
@@ -525,9 +546,86 @@ __asm__ volatile
" fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate
" fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate
" \n\t"
" add x4,x4,1 \n\t" // i = i+1
" cmp x4,x5 \n\t" // Continue
" blt DLOOP \n\t" // if i < N
" prfm pldl1keep,[x0,#64] \n\t" // Prefetch.
" prfm pldl1keep,[x1,#64] \n\t" // Prefetch.
" \n\t"
" sub x5,x5,1 \n\t" // i-=1
" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1.
"bne DLOOP \n\t"
" \n\t"
".DLASTITER: \n\t"
" \n\t"
" ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad
" \n\t"
" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate
" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate
" \n\t"
" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate
" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate
" \n\t"
" ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad
" \n\t"
" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate
" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate
" \n\t"
" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate
" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate
" \n\t"
" ld1r {v30.2d},[x7] \n\t" // Load alpha.
" \n\t"
" fmla v12.2d,v2.2d,v6.d[0] \n\t" // Accummulate
" fmla v14.2d,v2.2d,v6.d[1] \n\t" // Accummulate
" \n\t"
" fmla v13.2d,v3.2d,v6.d[0] \n\t" // Accummulate
" fmla v15.2d,v3.2d,v6.d[1] \n\t" // Accummulate
" \n\t"
" fmla v16.2d,v2.2d,v7.d[0] \n\t" // Accummulate
" fmla v18.2d,v2.2d,v7.d[1] \n\t" // Accummulate
" \n\t"
" fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate
" fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate
" \n\t"
" .DCONSIDERKLEFT: \n\t"
" cmp x6,0 \n\t" // If k_left == 0, we are done.
" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop.
" \n\t"
".DLOOPKLEFT: \n\t"
" \n\t"
" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch.
" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch.
" \n\t"
" ldp q0,q1,[x0],32 \n\t" // Load a into quad
" ldp q4,q5,[x1],32 \n\t" // Load b into quad
" \n\t"
//" sub x6,x6,1 \n\t"
" \n\t"
" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate
" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate
" \n\t"
" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate
" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate
" \n\t"
" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate
" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate
" \n\t"
" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate
" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate
" \n\t"
//" cmp x6,0 \n\t" // Iterate again.
//" bne .DLOOPKLEFT \n\t" // if i!=0.
" \n\t"
" .DPOSTACCUM: \n\t"
" ld1r {v30.2d},[x7] \n\t" // Load alpha.
" \n\t"
" cmp x13,#1 \n\t" // If rs_c != 1, C is not column-major; use the general-stride path.
" bne .DGENSTORED \n\t"
" \n\t"
" .DCOLSTORED: \n\t" // C is column-major.
" fcmp d31,#0.0 \n\t"
" beq .DBETAZEROCOLSTORED \n\t" // Taking care of the beta==0 case.
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C.
" \n\t"
" \n\t"
" ldp q0,q1,[x2] \n\t" // Load c into quad and increment by cs_c
" add x2,x2,x10 \n\t"
@@ -537,38 +635,140 @@ __asm__ volatile
" add x2,x2,x10 \n\t"
" ldp q6,q7,[x2] \n\t" // Load c into quad and increment by cs_c
" \n\t"
" ld1r {v30.2d},[x7] \n\t" // Load alpha
" \n\t"
" fmul v0.2d,v0.2d,v31.d[0] \n\t" // Scale by beta
" fmul v1.2d,v1.2d,v31.d[0] \n\t" // Scale by beta
" fmul v2.2d,v2.2d,v31.d[0] \n\t" // Scale by beta
" fmul v3.2d,v3.2d,v31.d[0] \n\t" // Scale by beta
" fmul v4.2d,v4.2d,v31.d[0] \n\t" // Scale by beta
" fmul v5.2d,v5.2d,v31.d[0] \n\t" // Scale by beta
" fmul v6.2d,v6.2d,v31.d[0] \n\t" // Scale by beta
" fmul v7.2d,v7.2d,v31.d[0] \n\t" // Scale by beta
" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta
" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta
" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta
" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta
" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta
" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta
" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta
" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta
" \n\t"
" prfm pldl2keep,[x16] \n\t"
" prfm pldl2keep,[x17] \n\t"
" \n\t"
" fmla v0.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v1.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v2.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v3.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v4.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v5.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v6.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v7.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha
" .DBETAZEROCOLSTORED: \n\t" // If beta==0, we won't read from C (nor scale).
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C
" \n\t"
" stp q0,q1,[x2] \n\t" // Store quad into c and increment by cs_c
" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha
" \n\t"
" stp q20,q21,[x2] \n\t" // Store quad into c and increment by cs_c
" add x2,x2,x10 \n\t"
" stp q2,q3,[x2] \n\t" // Store quad into c+4 and increment by cs_c
" stp q22,q23,[x2] \n\t" // Store quad into c+4 and increment by cs_c
" add x2,x2,x10 \n\t"
" stp q4,q5,[x2] \n\t" // Store quad into c+8 and increment by cs_c
" stp q24,q25,[x2] \n\t" // Store quad into c+8 and increment by cs_c
" add x2,x2,x10 \n\t"
" stp q6,q7,[x2] \n\t" // Store quad into c+16 and increment by cs_c
" stp q26,q27,[x2] \n\t" // Store quad into c+16 and increment by cs_c
" \n\t"
" b .DEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump).
" \n\t"
" .DGENSTORED: \n\t" // C is general-stride stored.
" \n\t"
" fcmp d31,#0.0 \n\t"
" beq .DBETAZEROGENSTORED \n\t"
" \n\t" // If beta!=0, then we can read from C.
" \n\t" // TODO: this was written quickly; rearrange to avoid reloading the address of C so many times.
" ldr x2,%[caddr] \n\t" // Load address of C.
" \n\t"
" ld1 {v0.d}[0],[x2],x14 \n\t" // Load c00 into quad and increment by rs_c.
" ld1 {v0.d}[1],[x2],x14 \n\t" // Load c01 into quad and increment by rs_c.
" ld1 {v1.d}[0],[x2],x14 \n\t" // Load c02 into quad and increment by rs_c.
" ld1 {v1.d}[1],[x2],x14 \n\t" // Load c03 into quad and increment by rs_c.
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C.
" add x2,x2,x10 \n\t" // c += cs_c.
" \n\t"
" ld1 {v2.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c.
" ld1 {v2.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c.
" ld1 {v3.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c.
" ld1 {v3.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c.
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C.
" add x2,x2,x10 \n\t" // c += cs_c.
" add x2,x2,x10 \n\t" // c += cs_c.
" \n\t"
" ld1 {v4.d}[0],[x2],x14 \n\t" // Load c20 into quad and increment by rs_c.
" ld1 {v4.d}[1],[x2],x14 \n\t" // Load c21 into quad and increment by rs_c.
" ld1 {v5.d}[0],[x2],x14 \n\t" // Load c22 into quad and increment by rs_c.
" ld1 {v5.d}[1],[x2],x14 \n\t" // Load c23 into quad and increment by rs_c.
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C.
" add x2,x2,x10 \n\t" // c += cs_c.
" add x2,x2,x10 \n\t" // c += cs_c.
" add x2,x2,x10 \n\t" // c += cs_c.
" \n\t"
" ld1 {v6.d}[0],[x2],x14 \n\t" // Load c30 into quad and increment by rs_c.
" ld1 {v6.d}[1],[x2],x14 \n\t" // Load c31 into quad and increment by rs_c.
" ld1 {v7.d}[0],[x2],x14 \n\t" // Load c32 into quad and increment by rs_c.
" ld1 {v7.d}[1],[x2],x14 \n\t" // Load c33 into quad and increment by rs_c.
" \n\t"
" prfm pldl1keep,[x16,0] \n\t" // Prefetch.
" prfm pldl1keep,[x17,0] \n\t" // Prefetch.
" \n\t"
" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta
" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta
" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta
" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta
" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta
" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta
" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta
" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta
" \n\t"
" .DBETAZEROGENSTORED: \n\t" // If beta==0, we cannot read from C (nor scale).
" \n\t"
" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha
" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C.
" \n\t"
" st1 {v20.d}[0],[x2],x14 \n\t" // Store c00 into quad and increment by rs_c.
" st1 {v20.d}[1],[x2],x14 \n\t" // Store c01 into quad and increment by rs_c.
" st1 {v21.d}[0],[x2],x14 \n\t" // Store c02 into quad and increment by rs_c.
" st1 {v21.d}[1],[x2],x14 \n\t" // Store c03 into quad and increment by rs_c.
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C.
" add x2,x2,x10 \n\t" // c += cs_c.
" \n\t"
" st1 {v22.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c.
" st1 {v22.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c.
" st1 {v23.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c.
" st1 {v23.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c.
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C.
" add x2,x2,x10 \n\t" // c += cs_c.
" add x2,x2,x10 \n\t" // c += cs_c.
" \n\t"
" st1 {v24.d}[0],[x2],x14 \n\t" // Store quad lane into c20 and increment by rs_c.
" st1 {v24.d}[1],[x2],x14 \n\t" // Store quad lane into c21 and increment by rs_c.
" st1 {v25.d}[0],[x2],x14 \n\t" // Store quad lane into c22 and increment by rs_c.
" st1 {v25.d}[1],[x2],x14 \n\t" // Store quad lane into c23 and increment by rs_c.
" \n\t"
" ldr x2,%[caddr] \n\t" // Load address of C.
" add x2,x2,x10 \n\t" // c += cs_c.
" add x2,x2,x10 \n\t" // c += cs_c.
" add x2,x2,x10 \n\t" // c += cs_c.
" \n\t"
" st1 {v26.d}[0],[x2],x14 \n\t" // Store quad lane into c30 and increment by rs_c.
" st1 {v26.d}[1],[x2],x14 \n\t" // Store quad lane into c31 and increment by rs_c.
" st1 {v27.d}[0],[x2],x14 \n\t" // Store quad lane into c32 and increment by rs_c.
" st1 {v27.d}[1],[x2],x14 \n\t" // Store quad lane into c33 and increment by rs_c.
" \n\t"
" .DEND: \n\t" // Done!
" \n\t"
:// output operands (none)
:// input operands
@@ -587,13 +787,15 @@ __asm__ volatile
"x0","x1","x2",
"x4","x5","x6",
"x7","x8","x9",
"x10","x11","x12",
"x10","x11","x12","x13","x14","x16","x17",
"v0","v1","v2",
"v3","v4","v5",
"v6","v7","v8",
"v9","v10","v11",
"v12","v13","v14",
"v15","v16","v17","v18","v19",
"v20","v21","v22","v23",
"v24","v25","v26","v27",
"v30","v31"
);