From a0a7b85ac3e157af53cff8db0e008f4a3f90372c Mon Sep 17 00:00:00 2001
From: Francisco Igual
Date: Tue, 27 Oct 2015 08:59:15 +0000
Subject: [PATCH] Fixed incomplete code in the double precision ARMv8
 microkernel.

---
 config/armv8a/bli_kernel.h               |  10 +-
 config/armv8a/make_defs.mk               |  18 +-
 kernels/armv8a/neon/3/bli_gemm_opt_4x4.c | 294 +++++++++++++++++++----
 3 files changed, 260 insertions(+), 62 deletions(-)

diff --git a/config/armv8a/bli_kernel.h b/config/armv8a/bli_kernel.h
index 2e2705122..3bd7da722 100644
--- a/config/armv8a/bli_kernel.h
+++ b/config/armv8a/bli_kernel.h
@@ -51,12 +51,12 @@
 // (b) MR (for zero-padding purposes when MR and NR are "swapped")
 //
 
-#define BLIS_DEFAULT_MC_S 416 // 1280 //160 // 160 // 160 //2048 //336
-#define BLIS_DEFAULT_KC_S 704 //1280 //672 //528 // 856 //2048 //528
+#define BLIS_DEFAULT_MC_S 336
+#define BLIS_DEFAULT_KC_S 336
 #define BLIS_DEFAULT_NC_S 4096
 
-#define BLIS_DEFAULT_MC_D 80 //176
-#define BLIS_DEFAULT_KC_D 336 //368
+#define BLIS_DEFAULT_MC_D 160
+#define BLIS_DEFAULT_KC_D 304
 #define BLIS_DEFAULT_NC_D 4096
 
 #define BLIS_DEFAULT_MC_C 64
@@ -132,8 +132,6 @@
 //#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...)
 //#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...)
 
-
-
 // -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk
index e81dc5026..63c03c6a0 100644
--- a/config/armv8a/make_defs.mk
+++ b/config/armv8a/make_defs.mk
@@ -76,20 +76,18 @@ GIT_LOG := $(GIT) log --decorate
 #
 
 # --- Determine the C compiler and related flags ---
-##CC := gcc
-CC := aarch64-linux-gnu-gcc
-#CC := arm-linux-gnueabihf-gcc-4.9.2
+CC := gcc
+
 # Enable IEEE Standard 1003.1-2004 (POSIX.1d).
 # NOTE: This is needed to enable posix_memalign().
-CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
-#CMISCFLAGS := -std=c99 -mtune=cortex-a57 -mfpu=neon-fp-armv8 -march=armv8-a #-mfloat-abi=hard -mfpu=neon
-CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-mtune=cortex-a57 -march=armv8-a -mfloat-abi=hard -mfpu=neon
+CPPROCFLAGS := -D_GNU_SOURCE
+CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -fopenmp -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
 CPICFLAGS := -fPIC
-CDBGFLAGS := -g
+CDBGFLAGS := -g #-g3 -gdwarf-2
 CWARNFLAGS := -Wall
-COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8 #-mfpu=neon -O2
+COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
 CKOPTFLAGS := $(COPTFLAGS)
-CVECFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57 -mtune=cortex-a57 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8
+CVECFLAGS := #-march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8
 
 # Aggregate all of the flags into multiple groups: one for standard
 # compilation, and one for each of the supported "special" compilation
@@ -105,7 +103,7 @@ ARFLAGS := cru
 # --- Determine the linker and related flags ---
 LINKER := $(CC)
 SOFLAGS := -shared
-LDFLAGS := -lm
+LDFLAGS := -lm -fopenmp
diff --git a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
index ba98d8dd3..2a54fe825 100644
--- a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
+++ b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c
@@ -1,4 +1,4 @@
-/*
+ /*
 
    BLIS
    An object-based framework for developing high-performance BLAS-like
@@ -36,12 +36,7 @@
 #include "blis.h"
 
 /*
-   o 4x4 Single precision micro-kernel fully functional.
-   o Runnable on ARMv8, compiled with aarch64 GCC.
-   o Use it together with the armv8 BLIS configuration.
    o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz.
-
-   December 2014.
 */
 void bli_sgemm_opt_4x4(
                         dim_t k,
@@ -55,15 +50,15 @@ void bli_sgemm_opt_4x4(
 {
     void* a_next = bli_auxinfo_next_a( data );
     void* b_next = bli_auxinfo_next_b( data );
-
+ 
     dim_t k_iter = k / 4;
-    dim_t k_left = k % 4;
+    dim_t k_left = k % 4; 
 
-__asm__ volatile
+__asm__ volatile 
 (
 " \n\t"
 " \n\t"
-" ldr x0,%[aaddr] \n\t" // Load address of A.
+" ldr x0,%[aaddr] \n\t" // Load address of A. 
 " ldr x1,%[baddr] \n\t" // Load address of B.
 " ldr x2,%[caddr] \n\t" // Load address of C.
 " \n\t"
@@ -433,12 +428,8 @@ __asm__ volatile
 
 
 /*
-   o 4x4 Double precision micro-kernel NOT fully functional yet.
-   o Runnable on ARMv8, compiled with aarch64 GCC.
-   o Use it together with the armv8 BLIS configuration.
-   o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz.
-
-   December 2014.
+   * Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz.
+   * Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz.
 */
 void bli_dgemm_opt_4x4(
                         dim_t k,
@@ -458,8 +449,8 @@ void bli_dgemm_opt_4x4(
 __asm__ volatile
 (
-" \n\t"
-" ldr x0,%[aaddr] \n\t" // Load address of A
+" \n\t" 
+" ldr x0,%[aaddr] \n\t" // Load address of A 
 " ldr x1,%[baddr] \n\t" // Load address of B
 " ldr x2,%[caddr] \n\t" // Load address of C
 " \n\t"
@@ -476,9 +467,16 @@ __asm__ volatile
 " \n\t"
 " ldr x9,%[cs_c] \n\t" // Load cs_c
 " lsl x10,x9,#3 \n\t" // cs_c * sizeof(double)
-" \n\t"
-" ldp q0,q1,[x0],32 \n\t" // Load a into quad
-" ldp q4,q5,[x1],32 \n\t" // Load b into quad
+" lsl x11,x9,#4 \n\t" // 2 * cs_c * sizeof(double) -- AUX.
+" add x12,x10,x11 \n\t" // 3 * cs_c * sizeof(double) -- AUX.
+" \n\t"
+" ldr x13,%[rs_c] \n\t" // Load rs_c.
+" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double).
+" \n\t"
+" prfm pldl1keep,[x2,0] \n\t" // Prefetch c.
+" prfm pldl1keep,[x2,x10] \n\t" // Prefetch c.
+" prfm pldl1keep,[x2,x11] \n\t" // Prefetch c.
+" prfm pldl1keep,[x2,x12] \n\t" // Prefetch c.
 " \n\t"
 " movi v12.2d,#0 \n\t" // Vector for accummulating column 0
 " movi v13.2d,#0 \n\t" // Vector for accummulating column 0
@@ -489,10 +487,33 @@ __asm__ volatile
 " movi v18.2d,#0 \n\t" // Vector for accummulating column 3
 " movi v19.2d,#0 \n\t" // Vector for accummulating column 3
 " \n\t"
+" movi v20.2d,#0 \n\t" // Vector for accumulating column 0
+" movi v21.2d,#0 \n\t" // Vector for accumulating column 0
+" movi v22.2d,#0 \n\t" // Vector for accumulating column 1
+" movi v23.2d,#0 \n\t" // Vector for accumulating column 1
+" movi v24.2d,#0 \n\t" // Vector for accumulating column 2
+" movi v25.2d,#0 \n\t" // Vector for accumulating column 2
+" movi v26.2d,#0 \n\t" // Vector for accumulating column 3
+" movi v27.2d,#0 \n\t" // Vector for accumulating column 3
+" \n\t"
 " ld1r {v31.2d},[x8] \n\t" // Load beta
 " \n\t"
+" \n\t"
+" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left.
+" beq .DCONSIDERKLEFT \n\t"
+" \n\t"
+" ldp q0,q1,[x0],32 \n\t" // Load a
+" ldp q4,q5,[x1],32 \n\t" // Load b
+" \n\t"
+" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one
+" beq .DLASTITER \n\t" // (as the loop is do-while-like).
+" \n\t"
+" \n\t"
 " DLOOP: \n\t" // Body
 " \n\t"
+" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch.
+" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch.
+" \n\t"
 " ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad
 " \n\t"
 " fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate
 " fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate
 " \n\t"
 " fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate
 " fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate
 " \n\t"
 " ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad
 " \n\t"
 " fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate
 " fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate
 " \n\t"
 " fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate
 " fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate
 " \n\t"
@@ -525,9 +546,86 @@ __asm__ volatile
 " fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate
 " fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate
 " \n\t"
-" add x4,x4,1 \n\t" // i = i+1
-" cmp x4,x5 \n\t" // Continue
-" blt DLOOP \n\t" // if i < N
+" prfm pldl1keep,[x0,#64] \n\t" // Prefetch.
+" prfm pldl1keep,[x1,#64] \n\t" // Prefetch.
+" \n\t"
+" sub x5,x5,1 \n\t" // i = i-1
+" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1.
+" bne DLOOP \n\t"
+" \n\t"
+".DLASTITER: \n\t"
+" \n\t"
+" ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad
+" \n\t"
+" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accumulate
+" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accumulate
+" \n\t"
+" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accumulate
+" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accumulate
+" \n\t"
+" ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad
+" \n\t"
+" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accumulate
+" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accumulate
+" \n\t"
+" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accumulate
+" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accumulate
+" \n\t"
+" ld1r {v30.2d},[x7] \n\t" // Load alpha.
+" \n\t" +" fmla v12.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v14.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v13.2d,v3.2d,v6.d[0] \n\t" // Accummulate +" fmla v15.2d,v3.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v16.2d,v2.2d,v7.d[0] \n\t" // Accummulate +" fmla v18.2d,v2.2d,v7.d[1] \n\t" // Accummulate +" \n\t" +" fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate +" fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate +" \n\t" +" .DCONSIDERKLEFT: \n\t" +" cmp x6,0 \n\t" // If k_left == 0, we are done. +" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. +" \n\t" +".DLOOPKLEFT: \n\t" +" \n\t" +" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. +" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. +" \n\t" +" ldp q0,q1,[x0],32 \n\t" // Load a into quad +" ldp q4,q5,[x1],32 \n\t" // Load b into quad +" \n\t" +//" sub x6,x6,1 \n\t" +" \n\t" +" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" \n\t" +" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" \n\t" +" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" \n\t" +" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" \n\t" +//" cmp x6,0 \n\t" // Iterate again. +//" bne .DLOOPKLEFT \n\t" // if i!=0. +" \n\t" +" .DPOSTACCUM: \n\t" +" ld1r {v30.2d},[x7] \n\t" // Load alpha. +" \n\t" +" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" bne .DGENSTORED \n\t" +" \n\t" +" .DCOLSTORED: \n\t" // C is column-major. +" fcmp d31,#0.0 \n\t" +" beq .DBETAZEROCOLSTORED \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" \n\t" " \n\t" " ldp q0,q1,[x2] \n\t" // Load c into quad and increment by cs_c " add x2,x2,x10 \n\t" @@ -537,38 +635,140 @@ __asm__ volatile " add x2,x2,x10 \n\t" " ldp q6,q7,[x2] \n\t" // Load c into quad and increment by cs_c " \n\t" -" ld1r {v30.2d},[x7] \n\t" // Load alpha " \n\t" -" fmul v0.2d,v0.2d,v31.d[0] \n\t" // Scale by beta -" fmul v1.2d,v1.2d,v31.d[0] \n\t" // Scale by beta -" fmul v2.2d,v2.2d,v31.d[0] \n\t" // Scale by beta -" fmul v3.2d,v3.2d,v31.d[0] \n\t" // Scale by beta -" fmul v4.2d,v4.2d,v31.d[0] \n\t" // Scale by beta -" fmul v5.2d,v5.2d,v31.d[0] \n\t" // Scale by beta -" fmul v6.2d,v6.2d,v31.d[0] \n\t" // Scale by beta -" fmul v7.2d,v7.2d,v31.d[0] \n\t" // Scale by beta +" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta +" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta +" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta +" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta +" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta +" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta +" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta +" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta " \n\t" " prfm pldl2keep,[x16] \n\t" " prfm pldl2keep,[x17] \n\t" " \n\t" -" fmla v0.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v1.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v2.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v3.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v4.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v5.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v6.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v7.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" .DBETAZEROCOLSTORED: \n\t" // If beta==0, we won't read from C (nor scale). 
" \n\t" " ldr x2,%[caddr] \n\t" // Load address of C " \n\t" -" stp q0,q1,[x2] \n\t" // Store quad into c and increment by cs_c +" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" \n\t" +" stp q20,q21,[x2] \n\t" // Store quad into c and increment by cs_c " add x2,x2,x10 \n\t" -" stp q2,q3,[x2] \n\t" // Store quad into c+4 and increment by cs_c +" stp q22,q23,[x2] \n\t" // Store quad into c+4 and increment by cs_c " add x2,x2,x10 \n\t" -" stp q4,q5,[x2] \n\t" // Store quad into c+8 and increment by cs_c +" stp q24,q25,[x2] \n\t" // Store quad into c+8 and increment by cs_c " add x2,x2,x10 \n\t" -" stp q6,q7,[x2] \n\t" // Store quad into c+16 and increment by cs_c +" stp q26,q27,[x2] \n\t" // Store quad into c+16 and increment by cs_c +" \n\t" +" b .DEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). +" \n\t" +" .DGENSTORED: \n\t" // C is general-stride stored. +" \n\t" +" fcmp d31,#0.0 \n\t" +" beq .DBETAZEROGENSTORED \n\t" +" \n\t" // If beta!=0, then we can read from C. +" \n\t" // TODO: this was done fast. Rearrange to remove so many address reloads. +" ldr x2,%[caddr] \n\t" // Load address of C. +" \n\t" +" ld1 {v0.d}[0],[x2],x14 \n\t" // Load c00 into quad and increment by rs_c. +" ld1 {v0.d}[1],[x2],x14 \n\t" // Load c01 into quad and increment by rs_c. +" ld1 {v1.d}[0],[x2],x14 \n\t" // Load c02 into quad and increment by rs_c. +" ld1 {v1.d}[1],[x2],x14 \n\t" // Load c03 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" ld1 {v2.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v2.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v3.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v3.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" ld1 {v4.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v4.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v5.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v5.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" ld1 {v6.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v6.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v7.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v7.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" \n\t" +" prfm pldl1keep,[x16,0] \n\t" // Prefetch. +" prfm pldl1keep,[x17,0] \n\t" // Prefetch. 
+" \n\t" +" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta +" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta +" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta +" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta +" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta +" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta +" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta +" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTORED: \n\t" // If beta==0, we cannot read from C (nor scale). +" \n\t" +" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha +" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" \n\t" +" st1 {v20.d}[0],[x2],x14 \n\t" // Store c00 into quad and increment by rs_c. +" st1 {v20.d}[1],[x2],x14 \n\t" // Store c01 into quad and increment by rs_c. +" st1 {v21.d}[0],[x2],x14 \n\t" // Store c02 into quad and increment by rs_c. +" st1 {v21.d}[1],[x2],x14 \n\t" // Store c03 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" st1 {v22.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v22.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v23.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v23.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" st1 {v24.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v24.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v25.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v25.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" \n\t" +" ldr x2,%[caddr] \n\t" // Load address of C. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" add x2,x2,x10 \n\t" // c += cs_c. +" \n\t" +" st1 {v26.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v26.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v27.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v27.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" \n\t" +" .DEND: \n\t" // Done! " \n\t" :// output operands (none) :// input operands @@ -587,13 +787,15 @@ __asm__ volatile "x0","x1","x2", "x4","x5","x6", "x7","x8","x9", - "x10","x11","x12", + "x10","x11","x12","x13","x14","x16","x17", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", "v9","v10","v11", "v12","v13","v14", "v15","v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", "v30","v31" );