diff --git a/config/armv8a/bli_kernel.h b/config/armv8a/bli_kernel.h index 3bd7da722..38eaef60d 100644 --- a/config/armv8a/bli_kernel.h +++ b/config/armv8a/bli_kernel.h @@ -51,13 +51,13 @@ // (b) MR (for zero-padding purposes when MR and NR are "swapped") // -#define BLIS_DEFAULT_MC_S 336 -#define BLIS_DEFAULT_KC_S 336 -#define BLIS_DEFAULT_NC_S 4096 +#define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336 +#define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528 +#define BLIS_DEFAULT_NC_S 3072 -#define BLIS_DEFAULT_MC_D 160 -#define BLIS_DEFAULT_KC_D 304 -#define BLIS_DEFAULT_NC_D 4096 +#define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176 +#define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368 +#define BLIS_DEFAULT_NC_D 3072 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 @@ -69,11 +69,11 @@ // -- Register blocksizes -- -#define BLIS_DEFAULT_MR_S 4 -#define BLIS_DEFAULT_NR_S 4 +#define BLIS_DEFAULT_MR_S 8 +#define BLIS_DEFAULT_NR_S 12 -#define BLIS_DEFAULT_MR_D 4 -#define BLIS_DEFAULT_NR_D 4 +#define BLIS_DEFAULT_MR_D 6 +#define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 @@ -132,6 +132,8 @@ //#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...) //#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...) + + // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- @@ -146,8 +148,8 @@ // -- gemm -- -#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4 -#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4 +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8 // -- trsm-related -- diff --git a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c index 2a54fe825..e010d188f 100644 --- a/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c +++ b/kernels/armv8a/neon/3/bli_gemm_opt_4x4.c @@ -36,9 +36,21 @@ #include "blis.h" /* + o 4x4 Single precision micro-kernel fully functional. + o Runnable on ARMv8, compiled with aarch64 GCC. + o Use it together with the armv8 BLIS configuration. o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz. + + December 2014. + + * UPDATE NOVEMBER 2015 + * Micro-kernel changed to 8x12 + * Tested on Juno Board. Around 8.1 GFLOPS, 1 x A57 core @ 1.1 GHz. + * Tested on Juno Board. Around 15.9 GFLOPS, 2 x A57 cores @ 1.1 GHz. + * Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz. + * Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz. */ -void bli_sgemm_opt_4x4( +void bli_sgemm_opt_8x12( dim_t k, float* restrict alpha, float* restrict a, @@ -50,9 +62,9 @@ void bli_sgemm_opt_4x4( { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); - + dim_t k_iter = k / 4; - dim_t k_left = k % 4; + dim_t k_left = k % 4; __asm__ volatile ( @@ -62,10 +74,8 @@ __asm__ volatile " ldr x1,%[baddr] \n\t" // Load address of B. " ldr x2,%[caddr] \n\t" // Load address of C. " \n\t" -" mov x4,#1 \n\t" // Init loop counter (i=0). -" \n\t" -" ldr x16,%[a_next] \n\t" // Pointer to next block of A. -" ldr x17,%[b_next] \n\t" // Pointer to next pointer of B. +" ldr x3,%[a_next] \n\t" // Pointer to next block of A. +" ldr x4,%[b_next] \n\t" // Pointer to next pointer of B. " \n\t" " ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). " ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). @@ -75,157 +85,367 @@ __asm__ volatile " \n\t" " ldr x9,%[cs_c] \n\t" // Load cs_c. " lsl x10,x9,#2 \n\t" // cs_c * sizeof(float) -- AUX. 
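/*
 * The framework hands the micro-kernel C as a base pointer plus a row stride
 * rs_c and a column stride cs_c; x10 above (and x14 just below) hold their
 * byte-scaled forms. A minimal C model of the addressing this implies (the
 * helper name is illustrative, not a BLIS function):
 */
#include <stddef.h>

static inline float* c_elem_addr( float* c, size_t i, size_t j,
                                  size_t rs_c, size_t cs_c )
{
    // C(i,j) lives at c + i*rs_c + j*cs_c; rs_c == 1 is the fast
    // column-major case handled by .SCOLSTORED below.
    return c + i * rs_c + j * cs_c;
}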
-" lsl x11,x9,#3 \n\t" // 2 * cs_c * sizeof(float) -- AUX. -" lsl x12,x9,#4 \n\t" // 3 * cs_c * sizeof(float) -- AUX. " \n\t" " ldr x13,%[rs_c] \n\t" // Load rs_c. " lsl x14,x13,#2 \n\t" // rs_c * sizeof(float). -" \n\t" -" ldp q0,q1,[x0,0] \n\t" // Preload columns a,a+1 into two quads. -" ldp q4,q5,[x1,0] \n\t" // Preload rows b,b+1 into two quads. " \n\t" -" prfm pldl1keep,[x2,0] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x10] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x11] \n\t" // Prefetch c. -" prfm pldl1keep,[x2,x12] \n\t" // Prefetch c. +" add x16,x2,x10 \n\t" //Load address Column 1 of C +" add x17,x16,x10 \n\t" //Load address Column 2 of C +" add x18,x17,x10 \n\t" //Load address Column 3 of C +" add x19,x18,x10 \n\t" //Load address Column 4 of C +" add x20,x19,x10 \n\t" //Load address Column 5 of C +" add x21,x20,x10 \n\t" //Load address Column 6 of C +" add x22,x21,x10 \n\t" //Load address Column 7 of C +" add x23,x22,x10 \n\t" //Load address Column 8 of C +" add x24,x23,x10 \n\t" //Load address Column 9 of C +" add x25,x24,x10 \n\t" //Load address Column 10 of C +" add x26,x25,x10 \n\t" //Load address Column 11 of C " \n\t" -" \n\t" // Vectors for result columns. -" movi v8.4s,#0 \n\t" // Vector for result column 0. -" movi v9.4s,#0 \n\t" // Vector for result column 1. -" movi v10.4s,#0 \n\t" // Vector for result column 2. -" movi v11.4s,#0 \n\t" // Vector for result column 3. +" ldr q0, [x0] \n\t" +" ldr q1, [x0, #16] \n\t" // Load a " \n\t" -" \n\t" // Replicating accum. vectors for unrolling. -" movi v12.4s,#0 \n\t" // Vector 1 for accummulating column 0. -" movi v13.4s,#0 \n\t" // Vector 1 for accummulating column 1. -" movi v14.4s,#0 \n\t" // Vector 1 for accummulating column 2. -" movi v15.4s,#0 \n\t" // Vector 1 for accummulating column 3. +" ldr q2, [x1] \n\t" // Load b +" ldr q3, [x1, #16] \n\t" +" ldr q4, [x1, #32] \n\t" " \n\t" -" movi v16.4s,#0 \n\t" // Vector 2 for accummulating column 0. -" movi v17.4s,#0 \n\t" // Vector 2 for accummulating column 1. -" movi v18.4s,#0 \n\t" // Vector 2 for accummulating column 2. -" movi v19.4s,#0 \n\t" // Vector 2 for accummulating column 3. +" prfm pldl1keep,[x2] \n\t" // Prefetch c. +" prfm pldl1keep,[x16] \n\t" // Prefetch c. +" prfm pldl1keep,[x17] \n\t" // Prefetch c. +" prfm pldl1keep,[x18] \n\t" // Prefetch c. +" prfm pldl1keep,[x19] \n\t" // Prefetch c. +" prfm pldl1keep,[x20] \n\t" // Prefetch c. +" prfm pldl1keep,[x21] \n\t" // Prefetch c. +" prfm pldl1keep,[x22] \n\t" // Prefetch c. +" prfm pldl1keep,[x23] \n\t" // Prefetch c. +" prfm pldl1keep,[x24] \n\t" // Prefetch c. +" prfm pldl1keep,[x25] \n\t" // Prefetch c. +" prfm pldl1keep,[x26] \n\t" // Prefetch c. " \n\t" -" movi v20.4s,#0 \n\t" // Vector 3 for accummulating column 0. -" movi v21.4s,#0 \n\t" // Vector 3 for accummulating column 1. -" movi v22.4s,#0 \n\t" // Vector 3 for accummulating column 2. -" movi v23.4s,#0 \n\t" // Vector 3 for accummulating column 3. +" dup v8.4s, wzr \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #192] \n\t" +" dup v9.4s, wzr \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #256] \n\t" +" dup v10.4s, wzr \n\t" // Vector for accummulating column 1 +" prfm PLDL1KEEP, [x1, #320] \n\t" +" dup v11.4s, wzr \n\t" // Vector for accummulating column 1 +" dup v12.4s, wzr \n\t" // Vector for accummulating column 2 +" dup v13.4s, wzr \n\t" // Vector for accummulating column 2 " \n\t" -" movi v24.4s,#0 \n\t" // Vector 4 for accummulating column 0. -" movi v25.4s,#0 \n\t" // Vector 4 for accummulating column 1. 
-" movi v26.4s,#0 \n\t" // Vector 4 for accummulating column 2. -" movi v27.4s,#0 \n\t" // Vector 4 for accummulating column 3. +" dup v14.4s, wzr \n\t" // Vector for accummulating column 3 +" prfm PLDL1KEEP, [x0, #128] \n\t" +" dup v15.4s, wzr \n\t" // Vector for accummulating column 3 +" prfm PLDL1KEEP, [x0, #192] \n\t" +" dup v16.4s, wzr \n\t" // Vector for accummulating column 4 +" dup v17.4s, wzr \n\t" // Vector for accummulating column 4 +" dup v18.4s, wzr \n\t" // Vector for accummulating column 5 +" dup v19.4s, wzr \n\t" // Vector for accummulating column 5 " \n\t" -" ld1r {v31.4s},[x8] \n\t" // Load beta into quad. +" dup v20.4s, wzr \n\t" // Vector for accummulating column 6 +" dup v21.4s, wzr \n\t" // Vector for accummulating column 6 +" dup v22.4s, wzr \n\t" // Vector for accummulating column 7 +" dup v23.4s, wzr \n\t" // Vector for accummulating column 7 +" dup v24.4s, wzr \n\t" // Vector for accummulating column 8 +" dup v25.4s, wzr \n\t" // Vector for accummulating column 8 +" \n\t" +" dup v26.4s, wzr \n\t" // Vector for accummulating column 9 +" dup v27.4s, wzr \n\t" // Vector for accummulating column 9 +" dup v28.4s, wzr \n\t" // Vector for accummulating column 10 +" dup v29.4s, wzr \n\t" // Vector for accummulating column 10 +" dup v30.4s, wzr \n\t" // Vector for accummulating column 11 +" dup v31.4s, wzr \n\t" // Vector for accummulating column 11 " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. " beq .SCONSIDERKLEFT \n\t" " \n\t" +"add x0, x0, #32 \n\t" //update address of A +"add x1, x1, #48 \n\t" //update address of B +" \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. " beq .SLASTITER \n\t" // (as loop is do-while-like). " \n\t" " .SLOOPKITER: \n\t" // Body of the k_iter loop. " \n\t" -" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. +" ldr q5, [x0] \n\t" +" fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s, v1.4s,v2.s[0] \n\t" // Accummulate. +" ldr q6, [x0, #16] \n\t" +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1] \n\t" " \n\t" -" fmla v12.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v13.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x1, #336] \n\t" +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x1, #400] \n\t" +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x1, #464] \n\t" +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" ldp q6,q7,[x1,32] \n\t" // Load rows b+2,b+3 into quads. +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #16] \n\t" " \n\t" -" fmla v14.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v15.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. 
+" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #32] \n\t" +" \n\t" //End It 1 " \n\t" -" ldp q2,q3,[x0,32] \n\t" // Load columns a+2,a+3 into quads. +" ldr q0, [x0, #32] \n\t" +" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. +" ldr q1, [x0, #48] \n\t" +" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #48] \n\t" " \n\t" -" fmla v16.4s,v1.4s,v5.s[0] \n\t" // Accummulate. -" fmla v17.4s,v1.4s,v5.s[1] \n\t" // Accummulate. +" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x0, #224] \n\t" +" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. +" prfm PLDL1KEEP, [x0, #288] \n\t" +" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" fmla v18.4s,v1.4s,v5.s[2] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v5.s[3] \n\t" // Accummulate. +" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #64] \n\t" " \n\t" -" add x0,x0,64 \n\t" // Update a_ptr. -" add x1,x1,64 \n\t" // Update b_ptr. +" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #80] \n\t" +" \n\t" //End It 2 " \n\t" -" fmla v20.4s,v2.4s,v6.s[0] \n\t" // Accummulate. -" fmla v21.4s,v2.4s,v6.s[1] \n\t" // Accummulate. +" ldr q5, [x0, #64] \n\t" +" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. +" ldr q6, [x0, #80] \n\t" +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #96] \n\t" " \n\t" -" ldp q0,q1,[x0] \n\t" // Load columns a,a+1 into quads (next iteration). +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" fmla v22.4s,v2.4s,v6.s[2] \n\t" // Accummulate. -" fmla v23.4s,v2.4s,v6.s[3] \n\t" // Accummulate. +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #112] \n\t" " \n\t" -" ldp q4,q5,[x1] \n\t" // Load rows b,b+1 into quads (next iteration). +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. 
+" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #128] \n\t" +" \n\t" //End It 3 " \n\t" -" fmla v24.4s,v3.4s,v7.s[0] \n\t" // Accummulate. -" fmla v25.4s,v3.4s,v7.s[1] \n\t" // Accummulate. +" ldr q0, [x0, #96] \n\t" +" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. +" ldr q1, [x0, #112] \n\t" +" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #144] \n\t" " \n\t" -" prfm pldl1keep,[x0,#64] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#64] \n\t" // Prefetch. +" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" fmla v26.4s,v3.4s,v7.s[2] \n\t" // Accummulate. -" fmla v27.4s,v3.4s,v7.s[3] \n\t" // Accummulate. +" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #160] \n\t" " \n\t" +" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #176] \n\t" +" add x1, x1, #192 \n\t" +" add x0, x0, #128 \n\t" +" \n\t" //End It 4 " sub x5,x5,1 \n\t" // i-=1. " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. " bne .SLOOPKITER \n\t" " \n\t" -//" prfm pldl1keep,[x0,#1024] \n\t" -//" prfm pldl1keep,[x1,#1024] \n\t" -" \n\t" " .SLASTITER: \n\t" // Last iteration of k_iter loop. " \n\t" -" fmla v12.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v13.4s,v0.4s,v4.s[1] \n\t" // Accummulate. " \n\t" -" ldp q6,q7,[x1,32] \n\t" // Load rows b+2,b+3 into quads. +" ldr q5, [x0] \n\t" +" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. +" ldr q6, [x0, #16] \n\t" +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1] \n\t" " \n\t" -" fmla v14.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v15.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" ldp q2,q3,[x0,32] \n\t" // Load columns a+2,a+3 into quads. +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. 
+" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #16] \n\t" " \n\t" -" fmla v16.4s,v1.4s,v5.s[0] \n\t" // Accummulate. -" fmla v17.4s,v1.4s,v5.s[1] \n\t" // Accummulate. +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #32] \n\t" +" \n\t" //End It 1 " \n\t" -" ld1r {v30.4s},[x7] \n\t" // Load alpha. +" ldr q0, [x0, #32] \n\t" +" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. +" ldr q1, [x0, #48] \n\t" +" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #48] \n\t" " \n\t" -" fmla v18.4s,v1.4s,v5.s[2] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v5.s[3] \n\t" // Accummulate. +" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" -" fmla v20.4s,v2.4s,v6.s[0] \n\t" // Accummulate. -" fmla v21.4s,v2.4s,v6.s[1] \n\t" // Accummulate. +" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #64] \n\t" " \n\t" -" fmla v22.4s,v2.4s,v6.s[2] \n\t" // Accummulate. -" fmla v23.4s,v2.4s,v6.s[3] \n\t" // Accummulate. +" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #80] \n\t" +" \n\t" //End It 2 " \n\t" -" fmla v24.4s,v3.4s,v7.s[0] \n\t" // Accummulate. -" fmla v25.4s,v3.4s,v7.s[1] \n\t" // Accummulate. +" ldr q5, [x0, #64] \n\t" +" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. +" ldr q6, [x0, #80] \n\t" +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. +" ldr q2, [x1, #96] \n\t" " \n\t" -" fmla v26.4s,v3.4s,v7.s[2] \n\t" // Accummulate. -" fmla v27.4s,v3.4s,v7.s[3] \n\t" // Accummulate. +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. 
" \n\t" -//" ld1 {v8.4s},[x2],x10 \n\t" // Load c into quad and increment by cs_c -//" ld1 {v9.4s},[x2],x10 \n\t" // Load c+4 into quad and increment by cs_c -//" ld1 {v10.4s},[x2],x10 \n\t" // Load c+8 into quad and increment by cs_c -//" ld1 {v11.4s},[x2],x10 \n\t" // Load c+16 into quad and increment by cs_c +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" ldr q3, [x1, #112] \n\t" " \n\t" -" fadd v12.4s,v12.4s,v16.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v13.4s,v13.4s,v17.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v14.4s,v14.4s,v18.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v15.4s,v15.4s,v19.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v12.4s,v12.4s,v20.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v13.4s,v13.4s,v21.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v14.4s,v14.4s,v22.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v15.4s,v15.4s,v23.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v12.4s,v12.4s,v24.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v13.4s,v13.4s,v25.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v14.4s,v14.4s,v26.4s \n\t" // Final accummulate of temporal accum. vectors. -" fadd v15.4s,v15.4s,v27.4s \n\t" // Final accummulate of temporal accum. vectors. +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. +" ldr q4, [x1, #128] \n\t" +" \n\t" //End It 3 " \n\t" -" add x0,x0,64 \n\t" // Update a_ptr. -" add x1,x1,64 \n\t" // Update b_ptr. +" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. +" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. +" \n\t" +" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. +" \n\t" +" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. +" \n\t" +" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. +" add x1, x1, #144 \n\t" +" add x0, x0, #96 \n\t" +" \n\t" //End It 4 " \n\t" " .SCONSIDERKLEFT: \n\t" " cmp x6,0 \n\t" // If k_left == 0, we are done. @@ -233,165 +453,595 @@ __asm__ volatile " \n\t" " .SLOOPKLEFT: \n\t" // Body of the left iterations " \n\t" -" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. 
+" ldr q0, [x0],#16 \n\t" +" ldr q1, [x0],#16 \n\t" // Load a " \n\t" -" ldr q0,[x0] \n\t" // Load a into quad (next iteration). -" ldr q4,[x1] \n\t" // Load b into quad (next iteration). -" \n\t" -" add x0,x0,16 \n\t" // Update a_ptr. -" add x1,x1,16 \n\t" // Update b_ptr. +" ldr q2, [x1],#16 \n\t" // Load b +" ldr q3, [x1],#16 \n\t" +" ldr q4, [x1],#16 \n\t" " \n\t" " sub x6,x6,1 \n\t" // i = i-1. " \n\t" -" fmla v12.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v13.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. +" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. +" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. +" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. +" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. +" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. +" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. +" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. " \n\t" -" fmla v14.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v15.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. +" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. +" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. +" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. +" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. +" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. +" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. +" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. +" \n\t" +" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. +" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. +" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. +" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. +" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. +" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. +" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. +" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " \n\t" " cmp x6,0 \n\t" // Iterate again. " bne .SLOOPKLEFT \n\t" // if i!=0. " \n\t" -" ld1r {v30.4s},[x7] \n\t" // Load alpha. -" \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" \n\t" " .SPOSTACCUM: \n\t" +" \n\t" +" ld1r {v6.4s},[x7] \n\t" // Load alpha. +" ld1r {v7.4s},[x8] \n\t" // Load beta +" \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) " bne .SGENSTORED \n\t" " \n\t" -" \n\t" " .SCOLSTORED: \n\t" // C is column-major. " \n\t" -" fcmp s31,#0.0 \n\t" -" beq .BETAZEROCOLSTORED \n\t" // Taking care of the beta==0 case. +" dup v0.4s, wzr \n\t" +" dup v1.4s, wzr \n\t" +" dup v2.4s, wzr \n\t" +" dup v3.4s, wzr \n\t" +" dup v4.4s, wzr \n\t" +" dup v5.4s, wzr \n\t" " \n\t" -" \n\t" // If beta!=0, then we can read from C. -" ld1 {v8.4s},[x2],x10 \n\t" // Load c into quad and increment by cs_c. -" ld1 {v9.4s},[x2],x10 \n\t" // Load c+4 into quad and increment by cs_c. -" ld1 {v10.4s},[x2],x10 \n\t" // Load c+8 into quad and increment by cs_c. -" ld1 {v11.4s},[x2],x10 \n\t" // Load c+16 into quad and increment by cs_c. +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" -" prfm pldl1keep,[x16,0] \n\t" // Prefetch. -" prfm pldl1keep,[x17,0] \n\t" // Prefetch. 
+" ldr q0, [x2] \n\t" //Load column 0 of C +" ldr q1, [x2, #16] \n\t" +" ldr q2, [x16] \n\t" //Load column 1 of C +" ldr q3, [x16, #16] \n\t" +" ldr q4, [x17] \n\t" //Load column 2 of C +" ldr q5, [x17, #16] \n\t" " \n\t" +" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta +" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta +" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta +" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta +" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta +" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" fmul v8.4s,v8.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v9.4s,v9.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v10.4s,v10.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v11.4s,v11.4s,v31.s[0] \n\t" // Scale by beta. +" .SBETAZEROCOLSTOREDS1: \n\t" " \n\t" -" .BETAZEROCOLSTORED: \n\t" // If beta==0, we won't read from C (nor scale). +" fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. +" str q0, [x2] \n\t" //Store column 0 of C +" str q1, [x2, #16] \n\t" +" str q2, [x16] \n\t" //Store column 1 of C +" str q3, [x16, #16] \n\t" +" str q4, [x17] \n\t" //Store column 2 of C +" str q5, [x17, #16] \n\t" " \n\t" -" fmla v8.4s,v12.4s,v30.s[0] \n\t" // Scale by alpha -" fmla v9.4s,v13.4s,v30.s[0] \n\t" // Scale by alpha -" fmla v10.4s,v14.4s,v30.s[0] \n\t" // Scale by alpha -" fmla v11.4s,v15.4s,v30.s[0] \n\t" // Scale by alpha +" dup v8.4s, wzr \n\t" +" dup v9.4s, wzr \n\t" +" dup v10.4s, wzr \n\t" +" dup v11.4s, wzr \n\t" +" dup v12.4s, wzr \n\t" +" dup v13.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr q8, [x18] \n\t" //Load column 3 of C +" ldr q9, [x18, #16] \n\t" +" ldr q10, [x19] \n\t" //Load column 4 of C +" ldr q11, [x19, #16] \n\t" +" ldr q12, [x20] \n\t" //Load column 5 of C +" ldr q13, [x20, #16] \n\t" +" \n\t" +" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta +" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta +" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta +" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta +" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta +" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROCOLSTOREDS2: \n\t" +" \n\t" +" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" str q8, [x18] \n\t" //Store column 3 of C +" str q9, [x18, #16] \n\t" +" str q10, [x19] \n\t" //Store column 4 of C +" str q11, [x19, #16] \n\t" +" str q12, [x20] \n\t" //Store column 5 of C +" str q13, [x20, #16] \n\t" +" \n\t" +" dup v0.4s, wzr \n\t" +" dup v1.4s, wzr \n\t" +" dup v2.4s, wzr \n\t" +" dup v3.4s, wzr \n\t" +" dup v4.4s, wzr \n\t" +" dup v5.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr q0, [x21] \n\t" //Load column 6 of C +" ldr q1, [x21, #16] \n\t" +" ldr q2, [x22] \n\t" //Load column 7 of C +" ldr q3, [x22, #16] \n\t" +" ldr q4, [x23] \n\t" //Load column 8 of C +" ldr q5, [x23, #16] \n\t" +" \n\t" +" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta +" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta +" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta +" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta +" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta +" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROCOLSTOREDS3: \n\t" +" \n\t" +" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" str q0, [x21] \n\t" //Store column 6 of C +" str q1, [x21, #16] \n\t" +" str q2, [x22] \n\t" //Store column 7 of C +" str q3, [x22, #16] \n\t" +" str q4, [x23] \n\t" //Store column 8 of C +" str q5, [x23, #16] \n\t" +" \n\t" +" dup v8.4s, wzr \n\t" +" dup v9.4s, wzr \n\t" +" dup v10.4s, wzr \n\t" +" dup v11.4s, wzr \n\t" +" dup v12.4s, wzr \n\t" +" dup v13.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr q8, [x24] \n\t" //Load column 9 of C +" ldr q9, [x24, #16] \n\t" +" ldr q10, [x25] \n\t" //Load column 10 of C +" ldr q11, [x25, #16] \n\t" +" ldr q12, [x26] \n\t" //Load column 11 of C +" ldr q13, [x26, #16] \n\t" +" \n\t" +" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta +" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta +" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta +" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta +" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta +" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROCOLSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" str q8, [x24] \n\t" //Store column 9 of C +" str q9, [x24, #16] \n\t" +" str q10, [x25] \n\t" //Store column 10 of C +" str q11, [x25, #16] \n\t" +" str q12, [x26] \n\t" //Store column 11 of C +" str q13, [x26, #16] \n\t" " \n\t" -" st1 {v8.4s},[x2],x10 \n\t" // Store quad into c and increment by cs_c -" st1 {v9.4s},[x2],x10 \n\t" // Store quad into c+4 and increment by cs_c -" st1 {v10.4s},[x2],x10 \n\t" // Store quad into c+8 and increment by cs_c -" st1 {v11.4s},[x2],x10 \n\t" // Store quad into c+16 and increment by cs_c " \n\t" " b .SEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). " \n\t" " \n\t" " .SGENSTORED: \n\t" // C is general-stride stored. " \n\t" -" fcmp s31,#0.0 \n\t" -" beq .BETAZEROGENSTORED \n\t" " \n\t" -" \n\t" // If beta!=0, then we can read from C. -" \n\t" // TODO: this was done fast. Rearrange to remove so many address reloads. -" ldr x2,%[caddr] \n\t" // Load address of C. 
+" dup v0.4s, wzr \n\t" +" dup v1.4s, wzr \n\t" +" dup v2.4s, wzr \n\t" +" dup v3.4s, wzr \n\t" +" dup v4.4s, wzr \n\t" +" dup v5.4s, wzr \n\t" " \n\t" -" ld1 {v8.s}[0],[x2],x14 \n\t" // Load c00 into quad and increment by rs_c. -" ld1 {v8.s}[1],[x2],x14 \n\t" // Load c01 into quad and increment by rs_c. -" ld1 {v8.s}[2],[x2],x14 \n\t" // Load c02 into quad and increment by rs_c. -" ld1 {v8.s}[3],[x2],x14 \n\t" // Load c03 into quad and increment by rs_c. +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x2 \n\t" " \n\t" -" ld1 {v9.s}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v9.s}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v9.s}[2],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v9.s}[3],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v0.s}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. +" ld1 {v0.s}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. +" ld1 {v0.s}[2],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. +" ld1 {v0.s}[3],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. +" ld1 {v1.s}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. +" ld1 {v1.s}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. +" ld1 {v1.s}[2],[x27],x14 \n\t" // Load c06 into quad and increment by rs_c. +" ld1 {v1.s}[3],[x27],x14 \n\t" // Load c07 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x16 \n\t" " \n\t" -" ld1 {v10.s}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v10.s}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v10.s}[2],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v10.s}[3],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v2.s}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v2.s}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v2.s}[2],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v2.s}[3],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v3.s}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. +" ld1 {v3.s}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. +" ld1 {v3.s}[2],[x27],x14 \n\t" // Load c16 into quad and increment by rs_c. +" ld1 {v3.s}[3],[x27],x14 \n\t" // Load c17 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x17 \n\t" " \n\t" -" ld1 {v11.s}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v11.s}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v11.s}[2],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v11.s}[3],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v4.s}[0],[x27],x14 \n\t" // Load c20 into quad and increment by rs_c. +" ld1 {v4.s}[1],[x27],x14 \n\t" // Load c21 into quad and increment by rs_c. +" ld1 {v4.s}[2],[x27],x14 \n\t" // Load c22 into quad and increment by rs_c. +" ld1 {v4.s}[3],[x27],x14 \n\t" // Load c23 into quad and increment by rs_c. 
+" ld1 {v5.s}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. +" ld1 {v5.s}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. +" ld1 {v5.s}[2],[x27],x14 \n\t" // Load c26 into quad and increment by rs_c. +" ld1 {v5.s}[3],[x27],x14 \n\t" // Load c27 into quad and increment by rs_c. " \n\t" +" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta +" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta +" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta +" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta +" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta +" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" -" prfm pldl1keep,[x16,0] \n\t" // Prefetch. -" prfm pldl1keep,[x17,0] \n\t" // Prefetch. +" .SBETAZEROGENSTOREDS1: \n\t" " \n\t" -" fmul v8.4s,v8.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v9.4s,v9.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v10.4s,v10.4s,v31.s[0] \n\t" // Scale by beta. -" fmul v11.4s,v11.4s,v31.s[0] \n\t" // Scale by beta. +" fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" -" .BETAZEROGENSTORED: \n\t" // If beta==0, we cannot read from C (nor scale). +" mov x27, x2 \n\t" " \n\t" -" fmla v8.4s,v12.4s,v30.s[0] \n\t" // Scale by alpha. -" fmla v9.4s,v13.4s,v30.s[0] \n\t" // Scale by alpha. -" fmla v10.4s,v14.4s,v30.s[0] \n\t" // Scale by alpha. -" fmla v11.4s,v15.4s,v30.s[0] \n\t" // Scale by alpha. +" st1 {v0.s}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. +" st1 {v0.s}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. +" st1 {v0.s}[2],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. +" st1 {v0.s}[3],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. +" st1 {v1.s}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. +" st1 {v1.s}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. +" st1 {v1.s}[2],[x27],x14 \n\t" // Store c06 into quad and increment by rs_c. +" st1 {v1.s}[3],[x27],x14 \n\t" // Store c07 into quad and increment by rs_c. " \n\t" +" mov x27, x16 \n\t" " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. +" st1 {v2.s}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v2.s}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v2.s}[2],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v2.s}[3],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. +" st1 {v3.s}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. +" st1 {v3.s}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. +" st1 {v3.s}[2],[x27],x14 \n\t" // Store c16 into quad and increment by rs_c. +" st1 {v3.s}[3],[x27],x14 \n\t" // Store c17 into quad and increment by rs_c. " \n\t" -" st1 {v8.s}[0],[x2],x14 \n\t" // Store c00 into quad and increment by rs_c. -" st1 {v8.s}[1],[x2],x14 \n\t" // Store c01 into quad and increment by rs_c. -" st1 {v8.s}[2],[x2],x14 \n\t" // Store c02 into quad and increment by rs_c. -" st1 {v8.s}[3],[x2],x14 \n\t" // Store c03 into quad and increment by rs_c. +" mov x27, x17 \n\t" " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. +" st1 {v4.s}[0],[x27],x14 \n\t" // Store c20 into quad and increment by rs_c. 
+" st1 {v4.s}[1],[x27],x14 \n\t" // Store c21 into quad and increment by rs_c. +" st1 {v4.s}[2],[x27],x14 \n\t" // Store c22 into quad and increment by rs_c. +" st1 {v4.s}[3],[x27],x14 \n\t" // Store c23 into quad and increment by rs_c. +" st1 {v5.s}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. +" st1 {v5.s}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. +" st1 {v5.s}[2],[x27],x14 \n\t" // Store c26 into quad and increment by rs_c. +" st1 {v5.s}[3],[x27],x14 \n\t" // Store c27 into quad and increment by rs_c. " \n\t" -" st1 {v9.s}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v9.s}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v9.s}[2],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v9.s}[3],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" dup v8.4s, wzr \n\t" +" dup v9.4s, wzr \n\t" +" dup v10.4s, wzr \n\t" +" dup v11.4s, wzr \n\t" +" dup v12.4s, wzr \n\t" +" dup v13.4s, wzr \n\t" " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. " \n\t" -" st1 {v10.s}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v10.s}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v10.s}[2],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v10.s}[3],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" mov x27, x18 \n\t" " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" ld1 {v8.s}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. +" ld1 {v8.s}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. +" ld1 {v8.s}[2],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. +" ld1 {v8.s}[3],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. +" ld1 {v9.s}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. +" ld1 {v9.s}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. +" ld1 {v9.s}[2],[x27],x14 \n\t" // Load c36 into quad and increment by rs_c. +" ld1 {v9.s}[3],[x27],x14 \n\t" // Load c37 into quad and increment by rs_c. " \n\t" -" st1 {v11.s}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v11.s}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v11.s}[2],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v11.s}[3],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" mov x27, x19 \n\t" " \n\t" +" ld1 {v10.s}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. +" ld1 {v10.s}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. +" ld1 {v10.s}[2],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. +" ld1 {v10.s}[3],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. +" ld1 {v11.s}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. +" ld1 {v11.s}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. +" ld1 {v11.s}[2],[x27],x14 \n\t" // Load c46 into quad and increment by rs_c. +" ld1 {v11.s}[3],[x27],x14 \n\t" // Load c47 into quad and increment by rs_c. " \n\t" +" mov x27, x20 \n\t" +" \n\t" +" ld1 {v12.s}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. 
+" ld1 {v12.s}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. +" ld1 {v12.s}[2],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. +" ld1 {v12.s}[3],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. +" ld1 {v13.s}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. +" ld1 {v13.s}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. +" ld1 {v13.s}[2],[x27],x14 \n\t" // Load c56 into quad and increment by rs_c. +" ld1 {v13.s}[3],[x27],x14 \n\t" // Load c57 into quad and increment by rs_c. +" \n\t" +" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta +" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta +" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta +" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta +" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta +" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROGENSTOREDS2: \n\t" +" \n\t" +" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x18 \n\t" +" \n\t" +" st1 {v8.s}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. +" st1 {v8.s}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. +" st1 {v8.s}[2],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. +" st1 {v8.s}[3],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. +" st1 {v9.s}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. +" st1 {v9.s}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. +" st1 {v9.s}[2],[x27],x14 \n\t" // Store c36 into quad and increment by rs_c. +" st1 {v9.s}[3],[x27],x14 \n\t" // Store c37 into quad and increment by rs_c. +" \n\t" +" mov x27, x19 \n\t" +" \n\t" +" st1 {v10.s}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. +" st1 {v10.s}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. +" st1 {v10.s}[2],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. +" st1 {v10.s}[3],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. +" st1 {v11.s}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. +" st1 {v11.s}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. +" st1 {v11.s}[2],[x27],x14 \n\t" // Store c46 into quad and increment by rs_c. +" st1 {v11.s}[3],[x27],x14 \n\t" // Store c47 into quad and increment by rs_c. +" \n\t" +" mov x27, x20 \n\t" +" \n\t" +" st1 {v12.s}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. +" st1 {v12.s}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. +" st1 {v12.s}[2],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. +" st1 {v12.s}[3],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. +" st1 {v13.s}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. +" st1 {v13.s}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. +" st1 {v13.s}[2],[x27],x14 \n\t" // Store c56 into quad and increment by rs_c. +" st1 {v13.s}[3],[x27],x14 \n\t" // Store c57 into quad and increment by rs_c. +" \n\t" +" dup v0.4s, wzr \n\t" +" dup v1.4s, wzr \n\t" +" dup v2.4s, wzr \n\t" +" dup v3.4s, wzr \n\t" +" dup v4.4s, wzr \n\t" +" dup v5.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" mov x27, x21 \n\t" +" \n\t" +" ld1 {v0.s}[0],[x27],x14 \n\t" // Load c60 into quad and increment by rs_c. +" ld1 {v0.s}[1],[x27],x14 \n\t" // Load c61 into quad and increment by rs_c. +" ld1 {v0.s}[2],[x27],x14 \n\t" // Load c62 into quad and increment by rs_c. +" ld1 {v0.s}[3],[x27],x14 \n\t" // Load c63 into quad and increment by rs_c. +" ld1 {v1.s}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. +" ld1 {v1.s}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. +" ld1 {v1.s}[2],[x27],x14 \n\t" // Load c66 into quad and increment by rs_c. +" ld1 {v1.s}[3],[x27],x14 \n\t" // Load c67 into quad and increment by rs_c. +" \n\t" +" mov x27, x22 \n\t" +" \n\t" +" ld1 {v2.s}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. +" ld1 {v2.s}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. +" ld1 {v2.s}[2],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. +" ld1 {v2.s}[3],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. +" ld1 {v3.s}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. +" ld1 {v3.s}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. +" ld1 {v3.s}[2],[x27],x14 \n\t" // Load c76 into quad and increment by rs_c. +" ld1 {v3.s}[3],[x27],x14 \n\t" // Load c77 into quad and increment by rs_c. +" \n\t" +" mov x27, x23 \n\t" +" \n\t" +" ld1 {v4.s}[0],[x27],x14 \n\t" // Load c80 into quad and increment by rs_c. +" ld1 {v4.s}[1],[x27],x14 \n\t" // Load c81 into quad and increment by rs_c. +" ld1 {v4.s}[2],[x27],x14 \n\t" // Load c82 into quad and increment by rs_c. +" ld1 {v4.s}[3],[x27],x14 \n\t" // Load c83 into quad and increment by rs_c. +" ld1 {v5.s}[0],[x27],x14 \n\t" // Load c84 into quad and increment by rs_c. +" ld1 {v5.s}[1],[x27],x14 \n\t" // Load c85 into quad and increment by rs_c. +" ld1 {v5.s}[2],[x27],x14 \n\t" // Load c86 into quad and increment by rs_c. +" ld1 {v5.s}[3],[x27],x14 \n\t" // Load c87 into quad and increment by rs_c. +" \n\t" +" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta +" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta +" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta +" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta +" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta +" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROGENSTOREDS3: \n\t" +" \n\t" +" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x21 \n\t" +" \n\t" +" st1 {v0.s}[0],[x27],x14 \n\t" // Store c60 into quad and increment by rs_c. +" st1 {v0.s}[1],[x27],x14 \n\t" // Store c61 into quad and increment by rs_c. +" st1 {v0.s}[2],[x27],x14 \n\t" // Store c62 into quad and increment by rs_c. +" st1 {v0.s}[3],[x27],x14 \n\t" // Store c63 into quad and increment by rs_c. +" st1 {v1.s}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. +" st1 {v1.s}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. +" st1 {v1.s}[2],[x27],x14 \n\t" // Store c66 into quad and increment by rs_c. +" st1 {v1.s}[3],[x27],x14 \n\t" // Store c67 into quad and increment by rs_c. +" \n\t" +" mov x27, x22 \n\t" +" \n\t" +" st1 {v2.s}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. +" st1 {v2.s}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. 
+" st1 {v2.s}[2],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. +" st1 {v2.s}[3],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. +" st1 {v3.s}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. +" st1 {v3.s}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. +" st1 {v3.s}[2],[x27],x14 \n\t" // Store c76 into quad and increment by rs_c. +" st1 {v3.s}[3],[x27],x14 \n\t" // Store c77 into quad and increment by rs_c. +" \n\t" +" mov x27, x23 \n\t" +" \n\t" +" st1 {v4.s}[0],[x27],x14 \n\t" // Store c80 into quad and increment by rs_c. +" st1 {v4.s}[1],[x27],x14 \n\t" // Store c81 into quad and increment by rs_c. +" st1 {v4.s}[2],[x27],x14 \n\t" // Store c82 into quad and increment by rs_c. +" st1 {v4.s}[3],[x27],x14 \n\t" // Store c83 into quad and increment by rs_c. +" st1 {v5.s}[0],[x27],x14 \n\t" // Store c84 into quad and increment by rs_c. +" st1 {v5.s}[1],[x27],x14 \n\t" // Store c85 into quad and increment by rs_c. +" st1 {v5.s}[2],[x27],x14 \n\t" // Store c86 into quad and increment by rs_c. +" st1 {v5.s}[3],[x27],x14 \n\t" // Store c87 into quad and increment by rs_c. +" \n\t" +" dup v8.4s, wzr \n\t" +" dup v9.4s, wzr \n\t" +" dup v10.4s, wzr \n\t" +" dup v11.4s, wzr \n\t" +" dup v12.4s, wzr \n\t" +" dup v13.4s, wzr \n\t" +" \n\t" +" fcmp s7,#0.0 \n\t" +" beq .SBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" mov x27, x24 \n\t" +" \n\t" +" ld1 {v8.s}[0],[x27],x14 \n\t" // Load c90 into quad and increment by rs_c. +" ld1 {v8.s}[1],[x27],x14 \n\t" // Load c91 into quad and increment by rs_c. +" ld1 {v8.s}[2],[x27],x14 \n\t" // Load c92 into quad and increment by rs_c. +" ld1 {v8.s}[3],[x27],x14 \n\t" // Load c93 into quad and increment by rs_c. +" ld1 {v9.s}[0],[x27],x14 \n\t" // Load c94 into quad and increment by rs_c. +" ld1 {v9.s}[1],[x27],x14 \n\t" // Load c95 into quad and increment by rs_c. +" ld1 {v9.s}[2],[x27],x14 \n\t" // Load c96 into quad and increment by rs_c. +" ld1 {v9.s}[3],[x27],x14 \n\t" // Load c97 into quad and increment by rs_c. +" \n\t" +" mov x27, x25 \n\t" +" \n\t" +" ld1 {v10.s}[0],[x27],x14 \n\t" // Load c100 into quad and increment by rs_c. +" ld1 {v10.s}[1],[x27],x14 \n\t" // Load c101 into quad and increment by rs_c. +" ld1 {v10.s}[2],[x27],x14 \n\t" // Load c102 into quad and increment by rs_c. +" ld1 {v10.s}[3],[x27],x14 \n\t" // Load c103 into quad and increment by rs_c. +" ld1 {v11.s}[0],[x27],x14 \n\t" // Load c104 into quad and increment by rs_c. +" ld1 {v11.s}[1],[x27],x14 \n\t" // Load c105 into quad and increment by rs_c. +" ld1 {v11.s}[2],[x27],x14 \n\t" // Load c106 into quad and increment by rs_c. +" ld1 {v11.s}[3],[x27],x14 \n\t" // Load c107 into quad and increment by rs_c. +" \n\t" +" mov x27, x26 \n\t" +" \n\t" +" ld1 {v12.s}[0],[x27],x14 \n\t" // Load c110 into quad and increment by rs_c. +" ld1 {v12.s}[1],[x27],x14 \n\t" // Load c111 into quad and increment by rs_c. +" ld1 {v12.s}[2],[x27],x14 \n\t" // Load c112 into quad and increment by rs_c. +" ld1 {v12.s}[3],[x27],x14 \n\t" // Load c113 into quad and increment by rs_c. +" ld1 {v13.s}[0],[x27],x14 \n\t" // Load c114 into quad and increment by rs_c. +" ld1 {v13.s}[1],[x27],x14 \n\t" // Load c115 into quad and increment by rs_c. +" ld1 {v13.s}[2],[x27],x14 \n\t" // Load c116 into quad and increment by rs_c. +" ld1 {v13.s}[3],[x27],x14 \n\t" // Load c117 into quad and increment by rs_c. 
+" \n\t" +" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta +" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta +" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta +" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta +" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta +" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta +" \n\t" +" .SBETAZEROGENSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha +" fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x24 \n\t" +" \n\t" +" st1 {v8.s}[0],[x27],x14 \n\t" // Store c90 into quad and increment by rs_c. +" st1 {v8.s}[1],[x27],x14 \n\t" // Store c91 into quad and increment by rs_c. +" st1 {v8.s}[2],[x27],x14 \n\t" // Store c92 into quad and increment by rs_c. +" st1 {v8.s}[3],[x27],x14 \n\t" // Store c93 into quad and increment by rs_c. +" st1 {v9.s}[0],[x27],x14 \n\t" // Store c94 into quad and increment by rs_c. +" st1 {v9.s}[1],[x27],x14 \n\t" // Store c95 into quad and increment by rs_c. +" st1 {v9.s}[2],[x27],x14 \n\t" // Store c96 into quad and increment by rs_c. +" st1 {v9.s}[3],[x27],x14 \n\t" // Store c97 into quad and increment by rs_c. +" \n\t" +" mov x27, x25 \n\t" +" \n\t" +" st1 {v10.s}[0],[x27],x14 \n\t" // Store c100 into quad and increment by rs_c. +" st1 {v10.s}[1],[x27],x14 \n\t" // Store c101 into quad and increment by rs_c. +" st1 {v10.s}[2],[x27],x14 \n\t" // Store c102 into quad and increment by rs_c. +" st1 {v10.s}[3],[x27],x14 \n\t" // Store c103 into quad and increment by rs_c. +" st1 {v11.s}[0],[x27],x14 \n\t" // Store c104 into quad and increment by rs_c. +" st1 {v11.s}[1],[x27],x14 \n\t" // Store c105 into quad and increment by rs_c. +" st1 {v11.s}[2],[x27],x14 \n\t" // Store c106 into quad and increment by rs_c. +" st1 {v11.s}[3],[x27],x14 \n\t" // Store c107 into quad and increment by rs_c. +" \n\t" +" mov x27, x26 \n\t" +" \n\t" +" st1 {v12.s}[0],[x27],x14 \n\t" // Store c110 into quad and increment by rs_c. +" st1 {v12.s}[1],[x27],x14 \n\t" // Store c111 into quad and increment by rs_c. +" st1 {v12.s}[2],[x27],x14 \n\t" // Store c112 into quad and increment by rs_c. +" st1 {v12.s}[3],[x27],x14 \n\t" // Store c113 into quad and increment by rs_c. +" st1 {v13.s}[0],[x27],x14 \n\t" // Store c114 into quad and increment by rs_c. +" st1 {v13.s}[1],[x27],x14 \n\t" // Store c115 into quad and increment by rs_c. +" st1 {v13.s}[2],[x27],x14 \n\t" // Store c116 into quad and increment by rs_c. +" st1 {v13.s}[3],[x27],x14 \n\t" // Store c147 into quad and increment by rs_c. " \n\t" " .SEND: \n\t" // Done! " \n\t" @@ -410,10 +1060,13 @@ __asm__ volatile [b_next] "m" (b_next), // 10 [k] "m" (k) // 11 :// Register clobber list - "x0", "x1", "x2", "x4", + "x0", "x1", "x2","x3","x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12", - "x13","x14","x20", + "x13","x14","x15", + "x16","x17","x18","x19", + "x20","x21","x22","x23", + "x24","x25","x26","x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11", @@ -421,17 +1074,32 @@ __asm__ volatile "v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", - "v30","v31" + "v28","v29","v30","v31" ); } /* + o 4x4 Double precision micro-kernel NOT fully functional yet. + o Runnable on ARMv8, compiled with aarch64 GCC. 
+ o Use it together with the armv8 BLIS configuration.
+ o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz.
+
+ December 2014.
+
+ * UPDATE OCTOBER 2015: Now it is fully functional.
 * Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz.
 * Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz.
+
+ * UPDATE NOVEMBER 2015
+ * Micro-kernel changed to 6x8
+ * Tested on Juno Board. Around 4 GFLOPS, 1 x A57 core @ 1.1 GHz.
+ * Tested on Juno Board. Around 7.6 GFLOPS, 2 x A57 cores @ 1.1 GHz.
+ * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz.
+ * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz.
 */
-void bli_dgemm_opt_4x4(
+void bli_dgemm_opt_6x8(
     dim_t k,
     double* restrict alpha,
     double* restrict a,
@@ -444,8 +1112,8 @@ void bli_dgemm_opt_4x4(
     void* a_next = bli_auxinfo_next_a( data );
     void* b_next = bli_auxinfo_next_b( data );

-    dim_t k_iter = k / 2;
-    dim_t k_left = k % 2;
+    dim_t k_iter = k / 4;
+    dim_t k_left = k % 4;

 __asm__ volatile
 (
@@ -454,10 +1122,8 @@ __asm__ volatile
" ldr x1,%[baddr]               \n\t" // Load address of B
" ldr x2,%[caddr]               \n\t" // Load address of C
" \n\t"
-" mov x4,#0                     \n\t" // Init loop counter (i=0)
-" \n\t"
-" ldr x16,%[a_next]             \n\t" // Move pointer
-" ldr x17,%[b_next]             \n\t" // Move pointer
+" ldr x3,%[a_next]              \n\t" // Move pointer
+" ldr x4,%[b_next]              \n\t" // Move pointer
" \n\t"
" ldr x5,%[k_iter]              \n\t" // Init guard (k_iter)
" ldr x6,%[k_left]              \n\t" // Init guard (k_left)
@@ -467,123 +1133,414 @@ __asm__ volatile
" \n\t"
" ldr x9,%[cs_c]                \n\t" // Load cs_c
" lsl x10,x9,#3                 \n\t" // cs_c * sizeof(double)
-" lsl x11,x9,#4                 \n\t" // 2 * cs_c * sizeof(double) -- AUX.
-" lsl x12,x9,#5                 \n\t" // 3 * cs_c * sizeof(double) -- AUX.
" \n\t"
" ldr x13,%[rs_c]               \n\t" // Load rs_c.
" lsl x14,x13,#3                \n\t" // rs_c * sizeof(double).
" \n\t"
-" prfm pldl1keep,[x2,0]         \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x10]       \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x11]       \n\t" // Prefetch c.
-" prfm pldl1keep,[x2,x12]       \n\t" // Prefetch c.
+" add x20,x2,x10                \n\t" //Load address Column 1 of C
+" add x21,x20,x10               \n\t" //Load address Column 2 of C
+" add x22,x21,x10               \n\t" //Load address Column 3 of C
+" add x23,x22,x10               \n\t" //Load address Column 4 of C
+" add x24,x23,x10               \n\t" //Load address Column 5 of C
+" add x25,x24,x10               \n\t" //Load address Column 6 of C
+" add x26,x25,x10               \n\t" //Load address Column 7 of C
" \n\t"
-" movi v12.2d,#0                \n\t" // Vector for accummulating column 0
-" movi v13.2d,#0                \n\t" // Vector for accummulating column 0
-" movi v14.2d,#0                \n\t" // Vector for accummulating column 1
-" movi v15.2d,#0                \n\t" // Vector for accummulating column 1
-" movi v16.2d,#0                \n\t" // Vector for accummulating column 2
-" movi v17.2d,#0                \n\t" // Vector for accummulating column 2
-" movi v18.2d,#0                \n\t" // Vector for accummulating column 3
-" movi v19.2d,#0                \n\t" // Vector for accummulating column 3
+" prfm pldl1keep,[x2]           \n\t" // Prefetch c.
+" prfm pldl1keep,[x20]          \n\t" // Prefetch c.
+" prfm pldl1keep,[x21]          \n\t" // Prefetch c.
+" prfm pldl1keep,[x22]          \n\t" // Prefetch c.
+" prfm pldl1keep,[x23]          \n\t" // Prefetch c.
+" prfm pldl1keep,[x24]          \n\t" // Prefetch c.
+" prfm pldl1keep,[x25]          \n\t" // Prefetch c.
+" prfm pldl1keep,[x26]          \n\t" // Prefetch c.
" \n\t"
-" movi v20.2d,#0                \n\t" // Vector for accummulating column 0
-" movi v21.2d,#0                \n\t" // Vector for accummulating column 0
-" movi v22.2d,#0                \n\t" // Vector for accummulating column 1
-" movi v23.2d,#0                \n\t" // Vector for accummulating column 1
-" movi v24.2d,#0                \n\t" // Vector for accummulating column 2
-" movi v25.2d,#0                \n\t" // Vector for accummulating column 2
-" movi v26.2d,#0                \n\t" // Vector for accummulating column 3
-" movi v27.2d,#0                \n\t" // Vector for accummulating column 3
+" ldr q0, [x0]                  \n\t"
+" ldr q1, [x0, #16]             \n\t" // Load a
+" ldr q2, [x0, #32]             \n\t"
" \n\t"
-" ld1r {v31.2d},[x8]            \n\t" // Load beta
+" ldr q3, [x1]                  \n\t" // Load b
+" ldr q4, [x1, #16]             \n\t"
+" ldr q5, [x1, #32]             \n\t"
+" ldr q6, [x1, #48]             \n\t"
+" \n\t"
+" dup v8.2d, xzr                \n\t" // Vector for accummulating column 0
+" prfm PLDL1KEEP, [x1, #256]    \n\t"
+" dup v9.2d, xzr                \n\t" // Vector for accummulating column 0
+" prfm PLDL1KEEP, [x1, #320]    \n\t"
+" dup v10.2d, xzr               \n\t" // Vector for accummulating column 0
+" prfm PLDL1KEEP, [x1, #384]    \n\t"
+" dup v11.2d, xzr               \n\t" // Vector for accummulating column 1
+" prfm PLDL1KEEP, [x1, #448]    \n\t"
+" dup v12.2d, xzr               \n\t" // Vector for accummulating column 1
+" dup v13.2d, xzr               \n\t" // Vector for accummulating column 1
+" \n\t"
+" dup v14.2d, xzr               \n\t" // Vector for accummulating column 2
+" prfm PLDL1KEEP, [x0, #192]    \n\t"
+" dup v15.2d, xzr               \n\t" // Vector for accummulating column 2
+" prfm PLDL1KEEP, [x0, #256]    \n\t"
+" dup v16.2d, xzr               \n\t" // Vector for accummulating column 2
+" prfm PLDL1KEEP, [x0, #320]    \n\t"
+" dup v17.2d, xzr               \n\t" // Vector for accummulating column 3
+" dup v18.2d, xzr               \n\t" // Vector for accummulating column 3
+" dup v19.2d, xzr               \n\t" // Vector for accummulating column 3
+" \n\t"
+" dup v20.2d, xzr               \n\t" // Vector for accummulating column 4
+" dup v21.2d, xzr               \n\t" // Vector for accummulating column 4
+" dup v22.2d, xzr               \n\t" // Vector for accummulating column 4
+" dup v23.2d, xzr               \n\t" // Vector for accummulating column 5
+" dup v24.2d, xzr               \n\t" // Vector for accummulating column 5
+" dup v25.2d, xzr               \n\t" // Vector for accummulating column 5
+" \n\t"
+" dup v26.2d, xzr               \n\t" // Vector for accummulating column 6
+" dup v27.2d, xzr               \n\t" // Vector for accummulating column 6
+" dup v28.2d, xzr               \n\t" // Vector for accummulating column 6
+" dup v29.2d, xzr               \n\t" // Vector for accummulating column 7
+" dup v30.2d, xzr               \n\t" // Vector for accummulating column 7
+" dup v31.2d, xzr               \n\t" // Vector for accummulating column 7
" \n\t"
" \n\t"
" cmp x5,#0                     \n\t" // If k_iter == 0, jump to k_left.
" beq .DCONSIDERKLEFT           \n\t"
" \n\t"
-" ldp q0,q1,[x0],32             \n\t" // Load a
-" ldp q4,q5,[x1],32             \n\t" // Load b
+" add x0, x0, #48               \n\t" // Update address of A
+" add x1, x1, #64               \n\t" // Update address of B
" \n\t"
" cmp x5,1                      \n\t" // If there is just one k_iter, jump to that one.
" beq .DLASTITER                \n\t" // (as loop is do-while-like).
-" \n\t"
" \n\t"
" DLOOP:                        \n\t" // Body
" \n\t"
-" prfm pldl1keep,[x0,#1024]     \n\t" // Prefetch.
-" prfm pldl1keep,[x1,#1024]     \n\t" // Prefetch.
+" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448 +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x1, #512] \n\t" +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x1, #576] \n\t" " \n\t" -" ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " \n\t" -" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate +" ldr q3, [x1] \n\t" " \n\t" -" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate +" ldr q7, [x0, #32] \n\t" " \n\t" -" ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate +" ldr q4, [x1, #16] \n\t" " \n\t" -" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #32] \n\t" " \n\t" -" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0] \n\t" " \n\t" -" ldp q0,q1,[x0],32 \n\t" // Load a into quad +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #16] \n\t" " \n\t" -" fmla v12.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v14.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #48] \n\t" +" \n\t" // End it 1 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x1, #640] \n\t" +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x0, #336] \n\t" +" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x0, #400] \n\t" " \n\t" -" fmla v13.2d,v3.2d,v6.d[0] \n\t" // Accummulate -" fmla v15.2d,v3.2d,v6.d[1] \n\t" // Accummulate +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate " \n\t" -" ldp q4,q5,[x1],32 \n\t" // Load b into quad +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate +" ldr q3, [x1, #64] \n\t" " \n\t" -" fmla v16.2d,v2.2d,v7.d[0] \n\t" // Accummulate -" fmla v18.2d,v2.2d,v7.d[1] \n\t" // Accummulate +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate +" ldr q2, [x0, #80] \n\t" " \n\t" -" fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate -" fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate +" ldr q4, [x1, #80] \n\t" " 
\n\t" -" prfm pldl1keep,[x0,#64] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#64] \n\t" // Prefetch. +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #96] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #48] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #64] \n\t" +" \n\t" +" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #112] \n\t" +" \n\t" //End it 2 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" prfm PLDL1KEEP, [x0, #464] \n\t" +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate +" \n\t" +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate +" \n\t" +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate +" ldr q3, [x1, #128] \n\t" +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate +" ldr q7, [x0, #128] \n\t" +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate +" ldr q4, [x1, #144] \n\t" +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #160] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #96] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #112] \n\t" +" \n\t" +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #176] \n\t" +" \n\t" // End it 3 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate +" \n\t" +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate +" ldr q3, [x1, #192] \n\t" +" \n\t" +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate +" ldr q2, [x0, #176] \n\t" +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate +" ldr q4, [x1, #208] \n\t" +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #224] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #144] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #160] 
\n\t" +" \n\t" +" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #240] \n\t" +" \n\t" //End it 4 +" add x0, x0, #192 \n\t" +" add x1, x1, #256 \n\t" " \n\t" " sub x5,x5,1 \n\t" // i-=1 " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. -"bne DLOOP \n\t" +" bne DLOOP \n\t" " \n\t" ".DLASTITER: \n\t" " \n\t" -" ldp q6,q7,[x1],32 \n\t" // Load b+4 into quad +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " \n\t" -" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate +" ldr q3, [x1] \n\t" " \n\t" -" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate +" ldr q7, [x0, #32] \n\t" " \n\t" -" ldp q2,q3,[x0],32 \n\t" // Load a+4 into quad +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate +" ldr q4, [x1, #16] \n\t" " \n\t" -" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate " \n\t" -" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #32] \n\t" " \n\t" -" ld1r {v30.2d},[x7] \n\t" // Load alpha. 
+" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0] \n\t" " \n\t" -" fmla v12.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v14.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #16] \n\t" " \n\t" -" fmla v13.2d,v3.2d,v6.d[0] \n\t" // Accummulate -" fmla v15.2d,v3.2d,v6.d[1] \n\t" // Accummulate +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #48] \n\t" +" \n\t" // End it 1 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate " \n\t" -" fmla v16.2d,v2.2d,v7.d[0] \n\t" // Accummulate -" fmla v18.2d,v2.2d,v7.d[1] \n\t" // Accummulate +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate +" ldr q3, [x1, #64] \n\t" " \n\t" -" fmla v17.2d,v3.2d,v7.d[0] \n\t" // Accummulate -" fmla v19.2d,v3.2d,v7.d[1] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate +" ldr q2, [x0, #80] \n\t" +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate +" ldr q4, [x1, #80] \n\t" +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #96] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #48] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #64] \n\t" +" \n\t" +" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #112] \n\t" +" \n\t" //End it 2 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate +" \n\t" +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate +" ldr q3, [x1, #128] \n\t" +" \n\t" +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate +" ldr q7, [x0, #128] \n\t" +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate +" ldr q4, [x1, #144] \n\t" +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" ldr q5, [x1, #160] \n\t" +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" ldr q0, [x0, #96] \n\t" +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla 
v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" ldr q1, [x0, #112] \n\t" +" \n\t" +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" ldr q6, [x1, #176] \n\t" +" \n\t" // End it 3 +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate +" \n\t" +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate +" \n\t" +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate +" \n\t" +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate +" \n\t" +" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" add x1, x1, #192 \n\t" +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate +" \n\t" //End it 4 +" add x0, x0, #144 \n\t" " \n\t" " .DCONSIDERKLEFT: \n\t" " cmp x6,0 \n\t" // If k_left == 0, we are done. @@ -591,182 +1548,488 @@ __asm__ volatile " \n\t" ".DLOOPKLEFT: \n\t" " \n\t" -" prfm pldl1keep,[x0,#1024] \n\t" // Prefetch. -" prfm pldl1keep,[x1,#1024] \n\t" // Prefetch. +" ldr q0, [x0],#16 \n\t" +" ldr q1, [x0],#16 \n\t" // Load a +" ldr q2, [x0],#16 \n\t" " \n\t" -" ldp q0,q1,[x0],32 \n\t" // Load a into quad -" ldp q4,q5,[x1],32 \n\t" // Load b into quad +" ldr q3, [x1],#16 \n\t" // Load b +" ldr q4, [x1],#16 \n\t" +" ldr q5, [x1],#16 \n\t" +" ldr q6, [x1],#16 \n\t" " \n\t" -//" sub x6,x6,1 \n\t" +" sub x6,x6,1 \n\t" " \n\t" -" fmla v12.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v14.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate +" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate +" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " \n\t" -" fmla v13.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate +" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate +" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " \n\t" -" fmla v16.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v18.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate +" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate +" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate " \n\t" -" fmla v17.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v19.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate +" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate +" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate " \n\t" -//" cmp x6,0 \n\t" // Iterate again. -//" bne .DLOOPKLEFT \n\t" // if i!=0. 
+" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate +" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate +" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate +" \n\t" +" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate +" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate +" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate +" \n\t" +" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate +" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate +" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate +" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate +" \n\t" +" cmp x6,0 \n\t" // Iterate again. +" bne .DLOOPKLEFT \n\t" // if i!=0. " \n\t" " .DPOSTACCUM: \n\t" -" ld1r {v30.2d},[x7] \n\t" // Load alpha. +" \n\t" +" ld1r {v6.2d},[x7] \n\t" // Load alpha. +" ld1r {v7.2d},[x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) " bne .DGENSTORED \n\t" " \n\t" " .DCOLSTORED: \n\t" // C is column-major. -" fcmp d31,#0.0 \n\t" -" beq .DBETAZEROCOLSTORED \n\t" // Taking care of the beta==0 case. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. +" dup v0.2d, xzr \n\t" +" dup v1.2d, xzr \n\t" +" dup v2.2d, xzr \n\t" +" dup v3.2d, xzr \n\t" +" dup v4.2d, xzr \n\t" +" dup v5.2d, xzr \n\t" " \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" -" ldp q0,q1,[x2] \n\t" // Load c into quad and increment by cs_c -" add x2,x2,x10 \n\t" -" ldp q2,q3,[x2] \n\t" // Load c into quad and increment by cs_c -" add x2,x2,x10 \n\t" -" ldp q4,q5,[x2] \n\t" // Load c into quad and increment by cs_c -" add x2,x2,x10 \n\t" -" ldp q6,q7,[x2] \n\t" // Load c into quad and increment by cs_c +" ldr q0, [x2] \n\t" //Load column 0 of C +" ldr q1, [x2, #16] \n\t" +" ldr q2, [x2, #32] \n\t" " \n\t" +" ldr q3, [x20] \n\t" //Load column 1 of C +" ldr q4, [x20, #16] \n\t" +" ldr q5, [x20, #32] \n\t" " \n\t" -" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta -" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta -" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta -" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta -" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta -" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta -" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta -" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta +" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta +" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta +" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta +" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta +" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta +" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" prfm pldl2keep,[x16] \n\t" -" prfm pldl2keep,[x17] \n\t" +" .DBETAZEROCOLSTOREDS1: \n\t" " \n\t" -" .DBETAZEROCOLSTORED: \n\t" // If beta==0, we won't read from C (nor scale). 
+" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C +" str q0, [x2] \n\t" //Store column 0 of C +" str q1, [x2, #16] \n\t" +" str q2, [x2, #32] \n\t" " \n\t" -" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" str q3, [x20] \n\t" //Store column 1 of C +" str q4, [x20, #16] \n\t" +" str q5, [x20, #32] \n\t" " \n\t" -" stp q20,q21,[x2] \n\t" // Store quad into c and increment by cs_c -" add x2,x2,x10 \n\t" -" stp q22,q23,[x2] \n\t" // Store quad into c+4 and increment by cs_c -" add x2,x2,x10 \n\t" -" stp q24,q25,[x2] \n\t" // Store quad into c+8 and increment by cs_c -" add x2,x2,x10 \n\t" -" stp q26,q27,[x2] \n\t" // Store quad into c+16 and increment by cs_c +" dup v8.2d, xzr \n\t" +" dup v9.2d, xzr \n\t" +" dup v10.2d, xzr \n\t" +" dup v11.2d, xzr \n\t" +" dup v12.2d, xzr \n\t" +" dup v13.2d, xzr \n\t" " \n\t" -" b .DEND \n\t" // Done (TODO: this obviously needs to be moved down to remove jump). +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr q8, [x21] \n\t" //Load column 2 of C +" ldr q9, [x21, #16] \n\t" +" ldr q10, [x21, #32] \n\t" +" \n\t" +" ldr q11, [x22] \n\t" //Load column 3 of C +" ldr q12, [x22, #16] \n\t" +" ldr q13, [x22, #32] \n\t" +" \n\t" +" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta +" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta +" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta +" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta +" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta +" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS2: \n\t" +" \n\t" +" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" str q8, [x21] \n\t" //Store column 2 of C +" str q9, [x21, #16] \n\t" +" str q10, [x21, #32] \n\t" +" \n\t" +" str q11, [x22] \n\t" //Store column 3 of C +" str q12, [x22, #16] \n\t" +" str q13, [x22, #32] \n\t" +" \n\t" +" dup v0.2d, xzr \n\t" +" dup v1.2d, xzr \n\t" +" dup v2.2d, xzr \n\t" +" dup v3.2d, xzr \n\t" +" dup v4.2d, xzr \n\t" +" dup v5.2d, xzr \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr q0, [x23] \n\t" //Load column 4 of C +" ldr q1, [x23, #16] \n\t" +" ldr q2, [x23, #32] \n\t" +" \n\t" +" ldr q3, [x24] \n\t" //Load column 5 of C +" ldr q4, [x24, #16] \n\t" +" ldr q5, [x24, #32] \n\t" +" \n\t" +" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta +" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta +" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta +" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta +" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta +" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS3: \n\t" +" \n\t" +" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" str q0, [x23] \n\t" //Store column 4 of C +" str q1, [x23, #16] \n\t" +" str q2, [x23, #32] \n\t" +" \n\t" +" str q3, [x24] \n\t" //Store column 5 of C +" str q4, [x24, #16] \n\t" +" str q5, [x24, #32] \n\t" +" \n\t" +" dup v8.2d, xzr \n\t" +" dup v9.2d, xzr \n\t" +" dup v10.2d, xzr \n\t" +" dup v11.2d, xzr \n\t" +" dup v12.2d, xzr \n\t" +" dup v13.2d, xzr \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr q8, [x25] \n\t" //Load column 6 of C +" ldr q9, [x25, #16] \n\t" +" ldr q10, [x25, #32] \n\t" +" \n\t" +" ldr q11, [x26] \n\t" //Load column 7 of C +" ldr q12, [x26, #16] \n\t" +" ldr q13, [x26, #32] \n\t" +" \n\t" +" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta +" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta +" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta +" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta +" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta +" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" str q8, [x25] \n\t" //Store column 6 of C +" str q9, [x25, #16] \n\t" +" str q10, [x25, #32] \n\t" +" \n\t" +" str q11, [x26] \n\t" //Store column 7 of C +" str q12, [x26, #16] \n\t" +" str q13, [x26, #32] \n\t" +" \n\t" +" b .DEND \n\t" " \n\t" " .DGENSTORED: \n\t" // C is general-stride stored. " \n\t" -" fcmp d31,#0.0 \n\t" -" beq .DBETAZEROGENSTORED \n\t" -" \n\t" // If beta!=0, then we can read from C. -" \n\t" // TODO: this was done fast. Rearrange to remove so many address reloads. -" ldr x2,%[caddr] \n\t" // Load address of C. +" dup v0.2d, xzr \n\t" +" dup v1.2d, xzr \n\t" +" dup v2.2d, xzr \n\t" +" dup v3.2d, xzr \n\t" +" dup v4.2d, xzr \n\t" +" dup v5.2d, xzr \n\t" " \n\t" -" ld1 {v0.d}[0],[x2],x14 \n\t" // Load c00 into quad and increment by rs_c. -" ld1 {v0.d}[1],[x2],x14 \n\t" // Load c01 into quad and increment by rs_c. -" ld1 {v1.d}[0],[x2],x14 \n\t" // Load c02 into quad and increment by rs_c. -" ld1 {v1.d}[1],[x2],x14 \n\t" // Load c03 into quad and increment by rs_c. +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. 
-" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x2 \n\t" +" \n\t" // Load address of C. +" ld1 {v0.d}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. +" ld1 {v0.d}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. +" ld1 {v1.d}[0],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. +" ld1 {v1.d}[1],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. +" ld1 {v2.d}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. +" ld1 {v2.d}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. " \n\t" -" ld1 {v2.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v2.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v3.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v3.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" mov x27, x20 \n\t" // Load address of C. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" ld1 {v3.d}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. +" ld1 {v3.d}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. +" ld1 {v4.d}[0],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. +" ld1 {v4.d}[1],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. +" ld1 {v5.d}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. +" ld1 {v5.d}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. " \n\t" -" ld1 {v4.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v4.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v5.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v5.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta +" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta +" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta +" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta +" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta +" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" .DBETAZEROGENSTOREDS1: \n\t" " \n\t" -" ld1 {v6.d}[0],[x2],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v6.d}[1],[x2],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v7.d}[0],[x2],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v7.d}[1],[x2],x14 \n\t" // Load c13 into quad and increment by rs_c. +" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" -" prfm pldl1keep,[x16,0] \n\t" // Prefetch. -" prfm pldl1keep,[x17,0] \n\t" // Prefetch. +" mov x27, x2 \n\t" // Load address of C. 
" \n\t" -" fmul v20.2d,v0.2d,v31.d[0] \n\t" // Scale by beta -" fmul v21.2d,v1.2d,v31.d[0] \n\t" // Scale by beta -" fmul v22.2d,v2.2d,v31.d[0] \n\t" // Scale by beta -" fmul v23.2d,v3.2d,v31.d[0] \n\t" // Scale by beta -" fmul v24.2d,v4.2d,v31.d[0] \n\t" // Scale by beta -" fmul v25.2d,v5.2d,v31.d[0] \n\t" // Scale by beta -" fmul v26.2d,v6.2d,v31.d[0] \n\t" // Scale by beta -" fmul v27.2d,v7.2d,v31.d[0] \n\t" // Scale by beta +" st1 {v0.d}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. +" st1 {v0.d}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. +" st1 {v1.d}[0],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. +" st1 {v1.d}[1],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. +" st1 {v2.d}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. +" st1 {v2.d}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. " \n\t" -" .DBETAZEROGENSTORED: \n\t" // If beta==0, we cannot read from C (nor scale). +" mov x27, x20 \n\t" // Load address of C. " \n\t" -" fmla v20.2d,v12.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v21.2d,v13.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v22.2d,v14.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v23.2d,v15.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v24.2d,v16.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v25.2d,v17.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v26.2d,v18.2d,v30.d[0] \n\t" // Scale by alpha -" fmla v27.2d,v19.2d,v30.d[0] \n\t" // Scale by alpha +" st1 {v3.d}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. +" st1 {v3.d}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. +" st1 {v4.d}[0],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. +" st1 {v4.d}[1],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. +" st1 {v5.d}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. +" st1 {v5.d}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. +" dup v8.2d, xzr \n\t" +" dup v9.2d, xzr \n\t" +" dup v10.2d, xzr \n\t" +" dup v11.2d, xzr \n\t" +" dup v12.2d, xzr \n\t" +" dup v13.2d, xzr \n\t" " \n\t" -" st1 {v20.d}[0],[x2],x14 \n\t" // Store c00 into quad and increment by rs_c. -" st1 {v20.d}[1],[x2],x14 \n\t" // Store c01 into quad and increment by rs_c. -" st1 {v21.d}[0],[x2],x14 \n\t" // Store c02 into quad and increment by rs_c. -" st1 {v21.d}[1],[x2],x14 \n\t" // Store c03 into quad and increment by rs_c. +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x21 \n\t" // Load address of C. " \n\t" -" st1 {v22.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v22.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v23.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v23.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" ld1 {v8.d}[0], [x27],x14 \n\t" // Load c20 into quad and increment by rs_c. +" ld1 {v8.d}[1], [x27],x14 \n\t" // Load c21 into quad and increment by rs_c. +" ld1 {v9.d}[0], [x27],x14 \n\t" // Load c22 into quad and increment by rs_c. +" ld1 {v9.d}[1], [x27],x14 \n\t" // Load c23 into quad and increment by rs_c. +" ld1 {v10.d}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. +" ld1 {v10.d}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. 
" \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" mov x27, x22 \n\t" // Load address of C. " \n\t" -" st1 {v24.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v24.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v25.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v25.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" ld1 {v11.d}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. +" ld1 {v11.d}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. +" ld1 {v12.d}[0],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. +" ld1 {v12.d}[1],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. +" ld1 {v13.d}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. +" ld1 {v13.d}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. " \n\t" -" ldr x2,%[caddr] \n\t" // Load address of C. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. -" add x2,x2,x10 \n\t" // c += cs_c. +" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta +" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta +" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta +" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta +" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta +" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" -" st1 {v26.d}[0],[x2],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v26.d}[1],[x2],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v27.d}[0],[x2],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v27.d}[1],[x2],x14 \n\t" // Store c13 into quad and increment by rs_c. +" .DBETAZEROGENSTOREDS2: \n\t" +" \n\t" +" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x21 \n\t" // Load address of C. +" \n\t" +" st1 {v8.d}[0], [x27],x14 \n\t" // Store c20 into quad and increment by rs_c. +" st1 {v8.d}[1], [x27],x14 \n\t" // Store c21 into quad and increment by rs_c. +" st1 {v9.d}[0], [x27],x14 \n\t" // Store c22 into quad and increment by rs_c. +" st1 {v9.d}[1], [x27],x14 \n\t" // Store c23 into quad and increment by rs_c. +" st1 {v10.d}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. +" st1 {v10.d}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. +" \n\t" +" mov x27, x22 \n\t" // Load address of C. +" \n\t" +" st1 {v11.d}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. +" st1 {v11.d}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. +" st1 {v12.d}[0],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. +" st1 {v12.d}[1],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. +" st1 {v13.d}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. +" st1 {v13.d}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. +" \n\t" +" dup v0.2d, xzr \n\t" +" dup v1.2d, xzr \n\t" +" dup v2.2d, xzr \n\t" +" dup v3.2d, xzr \n\t" +" dup v4.2d, xzr \n\t" +" dup v5.2d, xzr \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. +" \n\t" +" mov x27, x23 \n\t" // Load address of C. 
+" \n\t" +" ld1 {v0.d}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. +" ld1 {v0.d}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. +" ld1 {v1.d}[0],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. +" ld1 {v1.d}[1],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. +" ld1 {v2.d}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. +" ld1 {v2.d}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. +" \n\t" +" mov x27, x24 \n\t" // Load address of C. +" \n\t" +" ld1 {v3.d}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. +" ld1 {v3.d}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. +" ld1 {v4.d}[0],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. +" ld1 {v4.d}[1],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. +" ld1 {v5.d}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. +" ld1 {v5.d}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. +" \n\t" +" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta +" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta +" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta +" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta +" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta +" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS3: \n\t" +" \n\t" +" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x23 \n\t" // Load address of C. +" \n\t" +" st1 {v0.d}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. +" st1 {v0.d}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. +" st1 {v1.d}[0],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. +" st1 {v1.d}[1],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. +" st1 {v2.d}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. +" st1 {v2.d}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. +" \n\t" +" mov x27, x24 \n\t" // Load address of C. +" \n\t" +" st1 {v3.d}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. +" st1 {v3.d}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. +" st1 {v4.d}[0],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. +" st1 {v4.d}[1],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. +" st1 {v5.d}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. +" st1 {v5.d}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. +" \n\t" +" dup v8.2d, xzr \n\t" +" dup v9.2d, xzr \n\t" +" dup v10.2d, xzr \n\t" +" dup v11.2d, xzr \n\t" +" dup v12.2d, xzr \n\t" +" dup v13.2d, xzr \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" mov x27, x25 \n\t" +" \n\t" +" ld1 {v8.d}[0], [x27],x14 \n\t" // Load c60 into quad and increment by rs_c. +" ld1 {v8.d}[1], [x27],x14 \n\t" // Load c61 into quad and increment by rs_c. +" ld1 {v9.d}[0], [x27],x14 \n\t" // Load c62 into quad and increment by rs_c. +" ld1 {v9.d}[1], [x27],x14 \n\t" // Load c63 into quad and increment by rs_c. +" ld1 {v10.d}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. +" ld1 {v10.d}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. 
+" \n\t" +" mov x27, x26 \n\t" // Load address of C. +" \n\t" +" ld1 {v11.d}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. +" ld1 {v11.d}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. +" ld1 {v12.d}[0],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. +" ld1 {v12.d}[1],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. +" ld1 {v13.d}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. +" ld1 {v13.d}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. +" \n\t" +" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta +" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta +" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta +" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta +" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta +" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha +" fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha +" \n\t" +" mov x27, x25 \n\t" // Load address of C. +" \n\t" +" st1 {v8.d}[0], [x27],x14 \n\t" // Store c60 into quad and increment by rs_c. +" st1 {v8.d}[1], [x27],x14 \n\t" // Store c61 into quad and increment by rs_c. +" st1 {v9.d}[0], [x27],x14 \n\t" // Store c62 into quad and increment by rs_c. +" st1 {v9.d}[1], [x27],x14 \n\t" // Store c63 into quad and increment by rs_c. +" st1 {v10.d}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. +" st1 {v10.d}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. +" \n\t" +" mov x27, x26 \n\t" // Load address of C. +" \n\t" +" st1 {v11.d}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. +" st1 {v11.d}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. +" st1 {v12.d}[0],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. +" st1 {v12.d}[1],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. +" st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. +" st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. " \n\t" " .DEND: \n\t" // Done! " \n\t" @@ -784,10 +2047,12 @@ __asm__ volatile [a_next] "m" (a_next), // 8 [b_next] "m" (b_next) // 9 :// Register clobber list - "x0","x1","x2", + "x0","x1","x2","x3", "x4","x5","x6", "x7","x8","x9", "x10","x11","x12","x13","x14","x16","x17", + "x20","x21","x22","x23","x24","x25","x26", + "x27", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", @@ -796,7 +2061,7 @@ __asm__ volatile "v15","v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", - "v30","v31" + "v28","v29","v30","v31" );
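For reference, below is a minimal C sketch of the contract both micro-kernels in this patch implement, shown for the double-precision 6x8 case (the single-precision 8x12 kernel is the same shape with float, MR=8, NR=12). It is not part of the patch: dgemm_6x8_ref is a hypothetical name, plain long stands in for BLIS's dim_t/inc_t, and the auxinfo_t argument (used by the assembly only for the a_next/b_next prefetch hints) is omitted. It assumes the standard BLIS packing this kernel relies on, where each of the k rank-1 steps reads MR contiguous values from a and NR contiguous values from b.

#define MR 6  /* BLIS_DEFAULT_MR_D */
#define NR 8  /* BLIS_DEFAULT_NR_D */

void dgemm_6x8_ref( long k,
                    const double* alpha,
                    const double* a,   /* packed: MR values per k step */
                    const double* b,   /* packed: NR values per k step */
                    const double* beta,
                    double* c, long rs_c, long cs_c )
{
    double ab[MR][NR] = { { 0.0 } };   /* register block; lives in v8-v31 in the asm */

    /* k rank-1 updates: ab += a(:,l) * b(l,:). The assembly unrolls this
       loop by 4 (k_iter) and mops up the remainder in .DLOOPKLEFT (k_left). */
    for ( long l = 0; l < k; ++l, a += MR, b += NR )
        for ( long j = 0; j < NR; ++j )
            for ( long i = 0; i < MR; ++i )
                ab[ i ][ j ] += a[ i ] * b[ j ];

    /* C := beta*C + alpha*ab. When beta == 0, C is never read, which is
       what the .DBETAZERO* / .SBETAZERO* branches above guard: C may hold
       uninitialized data that must be overwritten, not scaled. */
    for ( long j = 0; j < NR; ++j )
        for ( long i = 0; i < MR; ++i )
        {
            double* cij = c + i * rs_c + j * cs_c;
            *cij = ( *beta == 0.0 ? 0.0 : *beta * (*cij) ) + (*alpha) * ab[ i ][ j ];
        }
}

The rs_c/cs_c strides are why the assembly keeps two store paths: the column-stored path (rs_c == 1) can use full quad-word ldr/str on whole columns, while the general-stride path must touch each element with ld1/st1 and advance by rs_c * sizeof(double), exactly as in the .DGENSTORED sections.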